From f8157e5618bc7d4decc9bea4248321cbe270dbb6 Mon Sep 17 00:00:00 2001 From: Mihai Stere Date: Thu, 26 Jun 2025 00:52:50 +0300 Subject: [PATCH] Mihai Stere feature compression changes --- README.md | 12 +- src/smlp_py/smlp_data.py | 154 ++++++++++++++++++++---- src/smlp_py/smlp_flows.py | 11 +- src/smlp_py/smlp_mrmr.py | 7 +- src/smlp_py/smlp_optimize.py | 6 +- src/smlp_py/smlp_pca.py | 225 +++++++++++++++++++++++++++++++++++ src/smlp_py/smlp_shap.py | 68 +++++++++++ src/smlp_py/smlp_terms.py | 32 +++-- src/smlp_py/train_sklearn.py | 2 + src/smlp_train.py | 6 +- 10 files changed, 467 insertions(+), 56 deletions(-) create mode 100644 src/smlp_py/smlp_pca.py create mode 100644 src/smlp_py/smlp_shap.py diff --git a/README.md b/README.md index 1bd0d4d5..27fdaa43 100644 --- a/README.md +++ b/README.md @@ -53,13 +53,13 @@ instruction for the installation on Ubuntu. # first the tool itself ../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \ - -out_dir ./ -pref Test83 -mode optimize -pareto t -sat_thresh f \ + -out_dir ./ -pref Test83 -mode optimize -pareto t \ -resp y1,y2 -feat x,p1,p2 -model dt_sklearn -dt_sklearn_max_depth 15 \ -spec smlp_toy_num_resp_mult_free_inps -data_scaler min_max \ -beta "y1>7 and y2>6" -objv_names obj1,objv2,objv3 \ -objv_exprs "(y1+y2)/2;y1/2-y2;y2" -epsilon 0.05 -delta_rel 0.01 \ - -save_model_config f -mrmr_pred 0 -plots f -seed 10 -log_time f \ - -spec ../specs/smlp_toy_num_resp_mult_free_inps + -save_model_config f -plots f -seed 10 -log_time f \ + -spec ../specs/smlp_toy_num_resp_mult_free_inps.spec # then the regression script ./smlp_regr.py -w 1 -def n -t 88 -tol 5 @@ -98,8 +98,8 @@ will produce the SMLP command for the regression test number 1: ../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \ -out_dir ./ -pref Test1 -mode train -resp y1 -feat x,p1,p2 \ - -model dt_caret -save_model_config f -mrmr_pred 0 -plots f \ - -seed 10 -log_time f + -model dt_caret -save_model_config f -feat_select_model mrmr \ + -feat_select_count 0 -plots f -seed 10 -log_time f For details about those parameters, please refer to the help messages (-h) of both tools, src/run_smlp.py and regr_smlp/code/smlp_regr.py, as well as the @@ -159,12 +159,14 @@ directory, run the following commands: -mode optimize -pareto f -sat_thresh f -resp o0 -feat \ Byte,CH,RANK,Timing,i0,i1,i2,i3 -model dt_sklearn -dt_sklearn_max_depth 15 \ -data_scaler min_max -epsilon 0.05 -log_time f -plots f \ + -feat_select_model mrmr -feat_select_count 15 \ -spec ../specs/smlp_s2_tx ../../src/run_smlp.py -out_dir ./ -pref smlp_s2_tx_nn -data ../data/smlp_s2_tx \ -mode optimize -pareto f -sat_thresh f -resp o0 \ -feat Byte,CH,RANK,Timing,i0,i1,i2,i3 \ -model nn_keras -nn_keras_epochs 20 -data_scaler min_max \ + -feat_select_model mrmr -feat_select_count 15 \ -epsilon 0.05 -log_time f -plots f -spec ../specs/smlp_s2_tx These runs will take longer than the regression tests provided earlier, diff --git a/src/smlp_py/smlp_data.py b/src/smlp_py/smlp_data.py index eb5d6cb1..309819d7 100644 --- a/src/smlp_py/smlp_data.py +++ b/src/smlp_py/smlp_data.py @@ -18,6 +18,9 @@ from smlp_py.smlp_utils import (np_JSONEncoder, list_intersection, str_to_bool, list_unique_unordered, lists_union_order_preserving_without_duplicates, get_response_type, cast_type, pd_df_col_is_numeric) from smlp_py.smlp_mrmr import SmlpMrmr +from smlp_py.smlp_shap import SmlpShap +from smlp_py.smlp_pca import SmlpPCA +#from smlp_py.smlp_spec import SmlpSpec #from smlp_py.smlp_spec import SmlpSpec from smlp_py.smlp_constants import * from smlp_py.smlp_discretize import SmlpDiscretize @@ -35,8 +38,10 @@ def __init__(self): # and in that case we might need to instantiate MrmrFeatures at a top level script (like smslp_train.py) # along with instantiating DataCommon (and maybe ModelsCommon). self._mrmrInst = SmlpMrmr() + self._shapInst = SmlpShap() + self._pcaInst = SmlpPCA() self._specInst = None # SmlpSpec() - + # default values of parameters related to dataset; used to generate args. self._DEF_TRAIN_FIRST = 0 # subseting first_n rows from training data self._DEF_TRAIN_RAND = 0 # sampling random_n rows from training data @@ -65,6 +70,10 @@ def __init__(self): # features that must be used in training models (assuming they are within initially defined input features) self._DEF_KEEP_FEATURES = [] + + # SMLP feature selection default parameters on model used and feature counts (off by default) + self._DEF_FEATURE_SELECTION = 'mrmr' + self._DEF_FEATURE_COUNT = 0 # Dictionary with features (names) that have a missing values as dictionary keys and row indices of the # missing values as dictionary values. It is computed just before imputing missing values in features. @@ -264,8 +273,18 @@ def __init__(self): 'response_plots': {'abbr':'resp_plots', 'default': self._DEF_RESPONSE_PLOTS, 'type':str_to_bool, 'help': 'Should response value distribution plots be genrated during data processing? ' + 'A related option interactive_plots controls whether the generated plots should be ' + - 'displayed interactively during runtime [default: ' + str(self._DEF_RESPONSE_PLOTS) + ']'} - } | self._mrmrInst.mrmr_params_dict + 'displayed interactively during runtime [default: ' + str(self._DEF_RESPONSE_PLOTS) + ']'}, + 'feature_selection_model': {'abbr':'feat_select_model', 'default': self._DEF_FEATURE_SELECTION, 'type':str, + 'help': 'What feature selection model do you want to use? ' + + 'Can choose between mrmr, shap, mrmr_shap or mrmr_shap_ensemble meaning the model can ' + + 'use just mrmr, just shap, mrmr then shap, or both scored combined' + + ' in order to select features [default: ' + str(self._DEF_FEATURE_SELECTION) + ']'}, + 'feature_selection_count': {'abbr':'feat_select_count', 'default': self._DEF_FEATURE_COUNT, 'type':int, + 'help': 'The amount of features the feature selection model should take into account. ' + + 'If mrmr_shap is chosen as the model, then the number of features will first be chosen by mrmr, then by shap ' + + 'and then listed together, and if mrmr_shap_ensemble is chosen, then the importance values will be combined ' + + 'and then features are chosen [default: ' + str(self._DEF_FEATURE_COUNT) + ']'} + } | self._pcaInst.pca_params_dict self.data_bounds_dict = None @@ -273,6 +292,8 @@ def __init__(self): def set_logger(self, logger): self._data_logger = logger self._mrmrInst.set_logger(logger) + self._shapInst.set_logger(logger) + self._pcaInst.set_logger(logger) # report_file_prefix is a string used as prefix in all report files of SMLP def set_report_file_prefix(self, report_file_prefix): @@ -908,7 +929,37 @@ def _split_data_for_training(self, X, y, split_test, train_first_n, train_random #print('X_test with y_test > 0.9', X_test.shape); print(X_test.head()) return X_train, y_train, X_test, y_test - + + def _combine_feature_importance(self, shap_importance, mrmr_importance, feature_names, feat_count): + + # Combines SHAP and MRMR feature importance scores using MinMax iteration. + importance_list = [] + + for i, feature in enumerate(feature_names): + # Get absolute MRMR and SHAP values + shap_val = abs(shap_importance[i]) + mrmr_val = abs(mrmr_importance.get(feature, 0)) + + importance_list.append([feature, shap_val, mrmr_val]) + + importance_df = pd.DataFrame(importance_list, columns=['Feature', 'SHAP_Importance', 'MRMR_Importance']) + + # Normalize SHAP and MRMR importance to [0,1] range + scaler = MinMaxScaler() + importance_df[['SHAP_Importance', 'MRMR_Importance']] = scaler.fit_transform( + importance_df[['SHAP_Importance', 'MRMR_Importance']] + ) + + # Compute the final combined importance score + importance_df['Combined_Importance'] = importance_df['SHAP_Importance'] + importance_df['MRMR_Importance'] + + # Sort by importance score and select top features + importance_df = importance_df.sort_values(by='Combined_Importance', ascending=False) + top_features = importance_df['Feature'].head(feat_count).tolist() + print(importance_df) + + return importance_df , top_features + # load data, scale using sklearn MinMaxScaler(), then split into training and test # subsets with ratio given by split_test. Optionally (based on arguments train_first_n, # train_random_n, train_uniform_n), subsample/resample training data using function @@ -922,11 +973,12 @@ def _split_data_for_training(self, X, y, split_test, train_first_n, train_random # Besides training and test subsets, the function returns also the MinMaxScaler object # used for data normalization, to be reused for applying the model to unseen datasets # and also to rescale back the prediction results to the original scale where needed. - def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test:float, + def _prepare_data_for_modeling(self, data_file:str, spec_path:str, is_training:bool, split_test:float, feat_names:list[str], resp_names:list[str], keep_feat:list[str], out_prefix:str, train_first_n:int, train_random_n:int, train_uniform_n:int, interactive_plots:bool, - response_plots:bool, mrmr_features_n:int, pos_value:int, neg_value:int, resp_map:str, resp_to_bool:str, - scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, + response_plots:bool, feat_select_model:str, feat_select_count:int, pca_features:int, + pos_value:int, neg_value:int, resp_map:str, + resp_to_bool:str, scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, mm_scaler_feat=None, mm_scaler_resp=None, levels_dict=None, model_features_dict=None): data_version_str = 'training' if is_training else 'new' self._data_logger.info('Preparing ' + data_version_str + ' data for modeling: start') @@ -945,7 +997,6 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test if not y is None: assert set(resp_names) == set(y.columns.tolist()) - # TMP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! start try_correlations = False if try_correlations: # use test 46 @@ -961,14 +1012,64 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test if is_training: #keep_feat = keep_feat + self._specInst.get_spec_constraint_vars(); #print('keep_feat', keep_feat) #print('features before mrmr', feat_names) + pca_equations = None + input_keep_feat = set(ft for ft in keep_feat if ft not in resp_names) for rn in resp_names: - mrmr_feat, _ = self._mrmrInst.smlp_mrmr(X, y[rn], mrmr_features_n); #print('mrmr_feat', mrmr_feat) - model_feat = [ft for ft in feat_names if (ft in mrmr_feat or ft in keep_feat)]; #print(model_feat); - model_features_dict[rn] = model_feat #mrmr_feat + selected_features = set(input_keep_feat) + if feat_select_model not in {'mrmr', 'shap', 'mrmr_shap_ensemble', 'mrmr_shap'}: + raise ValueError(f"Invalid feature selection model '{feat_select_model}'. Allowed values are: 'mrmr', 'shap', 'mrmr_shap_ensemble', 'mrmr_shap'.") + # If mrmr model or mrmr and shap model is called + if (feat_select_model == 'mrmr' or feat_select_model == 'mrmr_shap') and feat_select_count > 0: + mrmr_feat, _ , mrmr_model= self._mrmrInst.smlp_mrmr(X, y[rn], feat_select_count); #print('mrmr_feat', mrmr_feat) + model_feat = [ft for ft in feat_names if (ft in mrmr_feat or ft in keep_feat)]; #print(model_feat); + selected_features.update(model_feat) + # If shap model or mrmr and shap model is called + if (feat_select_model == 'shap' or feat_select_model == 'mrmr_shap') and feat_select_count > 0: + shap_feat, shap_importance = self._shapInst.smlp_shap(X, y[rn], feat_select_count) + shap_feat = [ft for ft in feat_names if (ft in shap_feat or ft in keep_feat)] + selected_features.update(shap_feat) + # If mrmr and shap ensemble model is called + if feat_select_model == 'mrmr_shap_ensemble' and feat_select_count > 0: + mrmr_feat, _ , mrmr_model= self._mrmrInst.smlp_mrmr(X, y[rn], feat_select_count) + shap_feat, shap_importance = self._shapInst.smlp_shap(X, y[rn], feat_select_count) + + model , mrmr_shap_feat = self._combine_feature_importance(shap_importance, mrmr_model, feat_names, feat_select_count) + + self._data_logger.info(f'MRMR_SHAP ensemble selected feature scores for response {rn}:\n{model}') + self._data_logger.info(f'MRMR_SHAP ensemble selected features for response {rn}:{mrmr_shap_feat}') + + mrmr_shap_ensemble_feat = [ft for ft in mrmr_shap_feat if ft in feat_names] + selected_features.update(mrmr_shap_ensemble_feat) + if feat_select_count > 0: + model_features_dict[rn] = sorted(selected_features, key=lambda ft: feat_names.index(ft)) feat_names = [ft for ft in feat_names if ft in lists_union_order_preserving_without_duplicates(list(model_features_dict.values()))] X = X[feat_names] - #print('features after mrmr', feat_names); print('model_features_dict after MRMR', model_features_dict) + # If pca model is called, still in development stage + if pca_features > 0: + max_components = min(pca_features, X.shape[1]) + if pca_features > X.shape[1]: + self._data_logger.warning(f"Requested {pca_features} PCA components, but only {X.shape[1]} features available. Using {max_components} components instead.") + pca_features = X.shape[1] + X_pca, pca_model = self._pcaInst.smlp_pca(X[feat_names], y, pca_features , spec_path) + X_reversed = self._pcaInst.inverse_transform(X_pca, X) + + # This is to compare the efficiency of reversing PCA, can comment this out + self._data_logger.info(f"Comparison between initial input matrix and PCA attempt to reconstruct it, initial matrix: \n {X} \n Reconstructed matrix: \n{X_reversed}") + + # Calculating variance for information about model efficiency + variance = pca_model.explained_variance_ratio_ + total_variance = sum(pca_model.explained_variance_ratio_) + self._data_logger.info(f"PCA Variance Ratio per output: {variance}") + self._data_logger.info(f"Total Variance Captured: {total_variance}") + + pca_equations = self._pcaInst.get_feature_equations(X_pca, X) + X = X_pca + + for rn in resp_names: + model_features_dict[rn] = list(X_pca.columns) # Use PCA-transformed feature names + + feat_names = list(X.columns) # encode levels of categorical features as integers for model training (in feature selection tasks # it is best to use the original categorical features). @@ -994,7 +1095,7 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test X_train, y_train, X_test, y_test = self._split_data_for_training(X, y, split_test, train_first_n, train_random_n, train_uniform_n) res = X, y, X_train, y_train, X_test, y_test, mm_scaler_feat, mm_scaler_resp, \ - feat_names, resp_names, levels_dict, model_features_dict + feat_names, resp_names, pca_equations, levels_dict, model_features_dict else: res = X, y @@ -1003,22 +1104,24 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test # Process data to prepare components required for training models and prediction, and reporting results in # original scale. Supports also prediction and results reporting in origibal scale from saved model - def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, is_training:bool, split_test, + def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, spec_path:str, is_training:bool, split_test, feat_names:list[str], resp_names:list[str], keep_feat:list[str], train_first_n:int, train_random_n:int, train_uniform_n:int, interactive_plots:bool, response_plots:bool, - scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, mrmr_features_n:int, - pos_value, neg_value, resp_map:str, resp_to_bool, save_model:bool, use_model:bool): + scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, feat_select_model:str, + feat_select_count:int, pca_features_n:int, pos_value, neg_value, + resp_map:str, resp_to_bool, save_model:bool, use_model:bool): #scale = not self._get_data_scaler(scaler_type) is None keep_feat = keep_feat + self._specInst.get_spec_constraint_vars() if data_file is not None: split_test = self._DEF_SPLIT_TEST if split_test is None else split_test X, y, X_train, y_train, X_test, y_test, mm_scaler_feat, mm_scaler_resp, \ - feat_names, resp_names, levels_dict, model_features_dict = self._prepare_data_for_modeling( - data_file, True, split_test, feat_names, resp_names, keep_feat, report_file_prefix, + feat_names, resp_names, pca_equations, levels_dict, model_features_dict = self._prepare_data_for_modeling( + data_file, spec_path, True, split_test, feat_names, resp_names, keep_feat, report_file_prefix, train_first_n, train_random_n, train_uniform_n, interactive_plots, response_plots, - mrmr_features_n, pos_value, neg_value, resp_map, resp_to_bool, scaler_type, - scale_features, scale_responses, impute_responses, None, None, None, None) + feat_select_model, feat_select_count, pca_features_n, pos_value, + neg_value, resp_map, resp_to_bool, scaler_type, scale_features, scale_responses, + impute_responses, None, None, None, None) # santy check that: mm_scaler_feat is not None --> scaler_type != 'none' assert not scaler_type == 'none' or mm_scaler_feat is None @@ -1040,13 +1143,14 @@ def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, model_features_dict = self._load_model_features(self.model_features_dict_file) feat_names = lists_union_order_preserving_without_duplicates(list(model_features_dict.values())) #print('model_features_dict loaded', model_features_dict); print('feat_names loaded', feat_names) - X, y, X_train, y_train, X_test, y_test = None, None, None, None, None, None + X, y, X_train, y_train, X_test, y_test , pca_equations = None, None, None, None, None, None if new_data_file is not None: X_new, y_new = self._prepare_data_for_modeling( - new_data_file, False, None, feat_names, resp_names, keep_feat, report_file_prefix, None, None, None, - interactive_plots, response_plots, mrmr_features_n, pos_value, neg_value, resp_map, resp_to_bool, scaler_type, - scale_features, scale_responses, impute_responses, mm_scaler_feat, mm_scaler_resp, levels_dict, model_features_dict) + new_data_file, spec_path, False, None, feat_names, resp_names, keep_feat, report_file_prefix, None, None, None, + interactive_plots, response_plots, feat_select_model, feat_select_count, pca_features_n, + pos_value, neg_value, resp_map, resp_to_bool, scaler_type,scale_features, scale_responses, + impute_responses, mm_scaler_feat, mm_scaler_resp, levels_dict, model_features_dict) else: X_new, y_new = None, None @@ -1059,4 +1163,4 @@ def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, X_new = X_new[common_features] return X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \ - levels_dict, model_features_dict, feat_names, resp_names + levels_dict, pca_equations, model_features_dict, feat_names, resp_names diff --git a/src/smlp_py/smlp_flows.py b/src/smlp_py/smlp_flows.py index d29f1f96..85a23548 100644 --- a/src/smlp_py/smlp_flows.py +++ b/src/smlp_py/smlp_flows.py @@ -299,13 +299,14 @@ def smlp_flow(self): #self.logger.info('Running SMLP in mode "{}": Start'.format(args.analytics_mode)) self.logger.info('PREPARE DATA FOR MODELING') X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \ - levels_dict, model_features_dict, feat_names, resp_names = self.dataInst.process_data( - self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, True, args.split_test, + levels_dict, pca_equations, model_features_dict, feat_names, resp_names = self.dataInst.process_data( + self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, args.spec, True, args.split_test, feat_names, resp_names, args.keep_features, args.train_first_n, args.train_random_n, args.train_uniform_n, args.interactive_plots, args.response_plots, args.data_scaler, - args.scale_features, args.scale_responses, args.impute_responses, args.mrmr_feat_count_for_prediction, + args.scale_features, args.scale_responses, args.impute_responses, args.feature_selection_model, + args.feature_selection_count, args.pca_feat_count_for_prediction, args.positive_value, args.negative_value, args.response_map, args.response_to_bool, args.save_model, args.use_model) - + # sanity check that the order of features in model_features_dict, feat_names, X_train, X_test, X is # the same; this is mostly important for model exploration modes self.modelInst.model_features_sanity_check(model_features_dict, feat_names, X_train, X_test, X) @@ -388,7 +389,7 @@ def smlp_flow(self): elif args.analytics_mode == 'optimize': self.optInst.smlp_optimize(syst_expr_dict, args.model, model, self.dataInst.unscaled_training_features, self.dataInst.unscaled_training_responses, - model_features_dict, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto, + model_features_dict, pca_equations, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto, args.optimization_strategy, quer_names, quer_exprs, delta_dict, args.epsilon, alpha_global_expr, beta_expr, args.eta, theta_radii_dict, args.solver_logic, args.vacuity_check, diff --git a/src/smlp_py/smlp_mrmr.py b/src/smlp_py/smlp_mrmr.py index 2ed66326..3a7ca811 100644 --- a/src/smlp_py/smlp_mrmr.py +++ b/src/smlp_py/smlp_mrmr.py @@ -11,9 +11,6 @@ def __init__(self): self._MRMR_FEATURES_PRED = 15 self._MRMR_FEATURES_CORR = 15 self.mrmr_params_dict = { - 'mrmr_feat_count_for_prediction': {'abbr':'mrmr_pred', 'default':self._MRMR_FEATURES_PRED, 'type':int, - 'help':'Count of features selected by MRMR algorithm for predictive models ' + - '[default: {}]'.format(str(self._MRMR_FEATURES_PRED))}, 'mrmr_feat_count_for_correlation': {'abbr':'mrmr_corr', 'default':self._MRMR_FEATURES_CORR, 'type':int, 'help':'Count of features selected by MRMR algorithm for correlation analysis ' + '[default: {}]'.format(str(self._MRMR_FEATURES_CORR))} @@ -63,7 +60,7 @@ def _mrmr_regres(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f', redund str(y.name) + ' :\n'+ str(mrmr_scores_df)) self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end') - return mrmr_res[0], mrmr_scores_df + return mrmr_res[0], mrmr_scores_df, mrmr_res[1] # mrmr feature selection using mrmr-feature package, where y is a categorical variable (pandas.Series) # TODO !!!: not tested @@ -87,7 +84,7 @@ def _mrmr_class(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f', y.name + ' :\n'+ str(mrmr_scores_df)) self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end') - return mrmr_res[0], mrmr_scores_df + return mrmr_res[0], mrmr_scores_df, mrmr_res[1] def smlp_mrmr(self, X:pd.DataFrame, y:pd.Series, #resp_type:str, #"numeric", feat_cnt:int): diff --git a/src/smlp_py/smlp_optimize.py b/src/smlp_py/smlp_optimize.py index 4ed0a29c..b8a03093 100644 --- a/src/smlp_py/smlp_optimize.py +++ b/src/smlp_py/smlp_optimize.py @@ -1096,8 +1096,8 @@ def check_synthesis_feasibility(self, feasibility:bool, objv_names:list[str], ob # function (and to smlp_optsyn() instead of passing X,y; The bounds on objectives are not strictly necessary, # any approximation may be used, but accurate approximation might reduce iterations count needed for # computing optimal confoguurations (in optimize and optsyn modes) - def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict, - feat_names:list[str], resp_names:list[str], + def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict, + pca_equations:dict, feat_names:list[str], resp_names:list[str], objv_names:list[str], objv_exprs, pareto:bool, strategy:str, #asrt_names:list[str], asrt_exprs, quer_names:list[str], quer_exprs, delta:float, epsilon:float, alph_expr:str, beta_expr:str, eta_expr:str, theta_radii_dict:dict, solver_logic:str, vacuity:bool, @@ -1115,7 +1115,7 @@ def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFram domain, syst_term_dict, model_full_term_dict, eta, alpha, beta, interface_consistent, model_consistent = \ self._modelTermsInst.create_model_exploration_base_components( - syst_expr_dict, algo, model, model_features_dict, feat_names, resp_names, + syst_expr_dict, algo, model, model_features_dict, pca_equations, feat_names, resp_names, #delta, epsilon, #objv_names, objv_exprs, None, None, None, None, alph_expr, beta_expr, eta_expr, data_scaler, scale_feat, scale_resp, #scale_objv, float_approx, float_precision, data_bounds_json_path) diff --git a/src/smlp_py/smlp_pca.py b/src/smlp_py/smlp_pca.py new file mode 100644 index 00000000..89ccaa4f --- /dev/null +++ b/src/smlp_py/smlp_pca.py @@ -0,0 +1,225 @@ +# SPDX-License-Identifier: Apache-2.0 +# This file is part of smlp. + +import numpy as np +import pandas as pd +import os +from sklearn.decomposition import PCA +import json +#from sklearn.preprocessing import StandardScaler + +class SmlpPCA: + def __init__(self): + self._pca_logger = None + self.pca_model = None + self._DEF_PCA_FEATURES_PRED = 0 + self.pca_params_dict = { + 'pca_feat_count_for_prediction': {'abbr':'pca_pred', 'default':self._DEF_PCA_FEATURES_PRED, 'type':int, + 'help':'Count of features selected by pca algorithm ' + + '[default: {}]'.format(str(self._DEF_PCA_FEATURES_PRED))}, + } + def set_logger(self, logger): + self._pca_logger = logger + + + + def create_pca_based_spec(self, X: pd.DataFrame, y: pd.DataFrame, original_spec_path: str): + if self.pca_model is None: + raise ValueError("PCA model has not been trained. Call smlp_pca() first.") + + self._pca_logger.info('Generating PCA-based spec file: start') + + # Read original spec file + with open(original_spec_path, "r") as f: + raw_text = f.read().replace('\t', ' ') + spec_data = json.loads(raw_text) + + # Prepare output directory + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + output_dir = os.path.join(base_dir, "regr_smlp", "models") + os.makedirs(output_dir, exist_ok=True) + + # Collect original knob ranges + knob_ranges = {} + original_variables = spec_data.get('variables', []) + for item in original_variables: + if item.get('interface') in ['knob', 'input']: + knob_ranges[item['label']] = item['range'] + + # PCA components and feature information + components = self.pca_model.components_ + feature_means = self.pca_model.mean_ + feature_names = X.columns.tolist() + pca_knobs = [] + pca_equations_log = ["PCA Component Equations:"] + # Create PCA knobs with corrected ranges and save equations + for i, component in enumerate(components): + pc_name = f"PC{i+1}" + + min_pc, max_pc = 0, 0 + offset = 0.0 + terms = [] + + for weight, feature in zip(component, feature_names): + if feature not in knob_ranges: + continue + feature_min, feature_max = knob_ranges[feature] + if weight >= 0: + min_pc += weight * feature_min + max_pc += weight * feature_max + else: + min_pc += weight * feature_max + max_pc += weight * feature_min + + # Accumulate the offset for mean centering + feature_idx = feature_names.index(feature) + + offset -= weight * feature_means[feature_idx] + + # Build equation term + if abs(weight) > 1e-6: + terms.append(f"{weight:.4f} * {feature}") + + # Final corrected min/max with offset + estimated_min = min(min_pc, max_pc) + offset + estimated_max = max(min_pc, max_pc) + offset + estimated_range = [round(estimated_min, 4), round(estimated_max, 4)] + + # Create knob + pca_knobs.append({ + "label": pc_name, + "interface": "knob", + "type": "real", + "range": estimated_range, + "rad-rel": 0.05 + }) + + # Create readable equation for PCA component + equation = " + ".join(terms) + equation += f" + {offset:.4f}" + pca_equations_log.append(f"{pc_name} ≈ {equation}") + + # Keep outputs unchanged + output_variables = [] + for item in original_variables: + if item.get('interface') == 'output': + output_variables.append(item) + + # Combine into new spec + new_variables = pca_knobs + output_variables + new_spec = { + "version": spec_data.get("version", "1.2"), + "variables": new_variables, + "objectives": spec_data.get("objectives", {}) + } + + # Save new PCA spec + output_spec_path = os.path.join(output_dir, "pca_generated.spec") + with open(output_spec_path, "w") as f: + json.dump(new_spec, f, indent=4) + + self._pca_logger.info(f"PCA-based spec saved to {output_spec_path}") + + # Save PCA component equations + pca_equations_path = os.path.join(output_dir, "pca_component_equations.txt") + with open(pca_equations_path, "w") as f: + f.write("\n".join(pca_equations_log)) + + self._pca_logger.info(f"PCA component equations saved to {pca_equations_path}") + self._pca_logger.info('Generating PCA-based spec file: end') + + return "OK" + + def smlp_pca(self, X: pd.DataFrame, y: pd.DataFrame, feat_count: int , spec_path: str): + + if X.shape[1] == 0: + return X , None + + self._pca_logger.info('PCA feature compression: start') + + self.pca_model = PCA(n_components=feat_count) + X_pca = self.pca_model.fit_transform(X) + + pca_columns = [f'PC{i+1}' for i in range(feat_count)] + X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=pca_columns) + + self._pca_logger.info(f'PCA applied: Reduced {X.shape[1]} features to {feat_count} components') + self._pca_logger.info('PCA feature compression: end') + + # Go back two levels from smlp_py to project root + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + + # Target output directory + output_dir = os.path.join(base_dir, "regr_smlp", "models") + os.makedirs(output_dir, exist_ok=True) + + # Save PCA components + pca_components_path = os.path.join(output_dir, "pca_components.csv") + X_pca_df.to_csv(pca_components_path, index=False) + self._pca_logger.info(f"PCA components saved to {pca_components_path}") + + # Save combined PCA components and outputs + X_pca_and_outputs = pd.concat([X_pca_df, y.loc[X_pca_df.index]], axis=1) + pca_full_data_path = os.path.join(output_dir, "pca_full_data.csv") + X_pca_and_outputs.to_csv(pca_full_data_path, index=False) + self._pca_logger.info(f"PCA components with outputs saved to {pca_full_data_path}") + + return X_pca_df, self.pca_model + + def inverse_transform(self, X_pca , X): + # Converts PCA features back to original feature space using stored PCA model. + + if self.pca_model is None: + raise ValueError("PCA model has not been trained. Call smlp_pca() first.") + + X_reconstructed = pd.DataFrame(self.pca_model.inverse_transform(X_pca), index=X_pca.index) + X_reconstructed.columns = list(X.columns) + + return X_reconstructed + + def get_feature_equations(self, X_pca, X): + # Generate a dictionary mapping each original feature to its equation in terms of PCA-transformed components . + + if self.pca_model is None: + raise ValueError("PCA model has not been trained. Call smlp_pca() first.") + W = self.pca_model.components_ + W_inv = np.linalg.pinv(W) + + # Get the mean values of original features + feature_means = self.pca_model.mean_ + feature_equations = {} + equations_log = ["PCA Equations for Original Features:"] + + for i, feature_name in enumerate(X.columns): + coefficients = W_inv[i, :] + + # Create equation for each feature + feature_index = X.columns.get_loc(feature_name) + equation = " + ".join(f"{coeff:.4f} * {pc}" for coeff, pc in zip(coefficients, X_pca.columns)) + equation += f" + {feature_means[feature_index]:.4f}" + equation_terms = {pc: coeff for pc, coeff in zip(X_pca.columns, coefficients)} + + # Store each output equation + feature_equations[feature_name] = { + "terms": equation_terms, + "offset": feature_means[i] + } + + equations_log.append(f"{feature_name} ≈ {equation}") + # Logs all equations, can be commented out if not relevant + self._pca_logger.info("\n".join(equations_log)) + + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + + # Target output directory + output_dir = os.path.join(base_dir, "regr_smlp", "models") + os.makedirs(output_dir, exist_ok=True) + # Save PCA feature equations + pca_equations_path = os.path.join(output_dir, "pca_equations.txt") + with open(pca_equations_path, "w") as f: + f.write("\n".join(equations_log)) + self._pca_logger.info(f"PCA feature equations saved to {pca_equations_path}") + + + return feature_equations + diff --git a/src/smlp_py/smlp_shap.py b/src/smlp_py/smlp_shap.py new file mode 100644 index 00000000..5eee0852 --- /dev/null +++ b/src/smlp_py/smlp_shap.py @@ -0,0 +1,68 @@ +import shap +import pandas as pd +import numpy as np +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import r2_score + +class SmlpShap: + def __init__(self): + self._shap_logger = None + self._SHAP_FEATURES_PRED = 15 + self._DEF_ESTIMATOR_NUMBER = 100 + + def set_logger(self, logger): + self._shap_logger = logger + + def smlp_shap(self, X: pd.DataFrame, y: pd.Series, feat_cnt: int): + if feat_cnt == 0 or X.shape[1] == 0: + return [], np.zeros(X.shape[1]) + self._shap_logger.info(f'SHAP feature selection for response {y.name}: start') + + + # Code added in order to reduce training time + X_sample = X.sample(n=min(1000, len(X)), random_state=42) + y_sample = y.loc[X_sample.index] + + # Train the model + model = RandomForestRegressor(n_estimators=self._DEF_ESTIMATOR_NUMBER, n_jobs=-1) + model.fit(X_sample, y_sample) + + r2 = r2_score(y_sample, model.predict(X_sample)) + if r2 < 0.01: + self._shap_logger.warning(f"Model R² score for response {y.name} is very low (R² = {r2:.5f}); SHAP results may be uninformative.") + return [], np.zeros(X_sample.shape[1]) + + + # Option 1: KernelExplainer (model independent, very slow) + # This is extremely slow for large datasets. + # explainer = shap.KernelExplainer(model.predict, X_sample) + + # Option 2: API Explainer (automatically selects an explainer type, slow for tree models) + # explainer = shap.Explainer(model.predict, X_sample, feature_perturbation="interventional") + + # Option 3 (recommended): TreeExplainer (fast and gives similar results to Kernel and API explainers in testing) + explainer = shap.TreeExplainer(model) + + # Compute mean absolute SHAP value per feature + shap_values = explainer.shap_values(X_sample) + if isinstance(shap_values, list): + shap_importance = np.abs(np.array(shap_values)).mean(axis=(0, 1)) + else: + shap_importance = np.abs(shap_values).mean(axis=0) + + if np.all(shap_importance == 0): + self._shap_logger.warning(f"All SHAP values are 0 for response {y.name}; skipping SHAP feature selection.") + return [], shap_importance + + # Rank features by SHAP importance in descending order + feature_importance = pd.DataFrame({ + 'Feature': X_sample.columns, + 'SHAP_Importance': shap_importance + }).sort_values(by='SHAP_Importance', ascending=False) + + selected_features = feature_importance['Feature'].iloc[:feat_cnt].tolist() + + self._shap_logger.info(f'SHAP selected feature scores for response {y.name}:{feature_importance}') + self._shap_logger.info(f'SHAP feature selection for response {y.name}: end') + + return selected_features , shap_importance \ No newline at end of file diff --git a/src/smlp_py/smlp_terms.py b/src/smlp_py/smlp_terms.py index ab0a6ddf..435390ab 100644 --- a/src/smlp_py/smlp_terms.py +++ b/src/smlp_py/smlp_terms.py @@ -2051,7 +2051,7 @@ def _compute_model_terms_dict(self, algo, model, feat_names, resp_names, data_bo def compute_models_terms_dict(self, algo, model_or_model_dict, model_features_dict, feat_names, resp_names, data_bounds, data_scaler,scale_features, scale_responses): #print('model_features_dict', model_features_dict); print('feat_names', feat_names, 'resp_names', resp_names, flush=True) - assert lists_union_order_preserving_without_duplicates(list(model_features_dict.values())) == feat_names + assert set(lists_union_order_preserving_without_duplicates(list(model_features_dict.values()))) == set(feat_names) #print('model_or_model_dict', model_or_model_dict) if isinstance(model_or_model_dict, dict): models_full_terms_dict = {} @@ -2355,7 +2355,7 @@ def var_domain(self, var, spec_domain_dict): return var_component # this function builds terms and formulas for constraints, system description and the models - def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, model, model_features_dict:dict, feat_names:list, resp_names:list, + def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, model, model_features_dict:dict, pca_equations:dict, feat_names:list, resp_names:list, alph_expr:str, beta_expr:str, eta_expr:str, data_scaler, scale_feat, scale_resp, float_approx=True, float_precision=64, data_bounds_json_path=None): self._smlp_terms_logger.info('Creating model exploration base components: Start') @@ -2373,14 +2373,26 @@ def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, mo # contraints on features used as control variables and on the responses alph_ranges = self.compute_input_ranges_formula_alpha_eta('alpha', feat_names); #print('alph_ranges') - alph_global = self.compute_global_alpha_formula(alph_expr, feat_names); #print('alph_global') - alpha = self.smlp_and(alph_ranges, alph_global); #print('alpha') - beta = self.compute_beta_formula(beta_expr, feat_names+resp_names); #print('beta') - eta_ranges = self.compute_input_ranges_formula_alpha_eta('eta', feat_names); #print('eta_ranges') - eta_grids = self.compute_grid_range_formulae_eta(feat_names); #print('eta_grids') - eta_global = self.compute_eta_formula(eta_expr, feat_names); #print('eta_global', eta_global) - eta = self.smlp_and_multi([eta_ranges, eta_grids, eta_global]); #print('eta', eta) - + + if pca_equations is None: + alph_global = self.compute_global_alpha_formula(alph_expr, feat_names); #print('alph_global') + alpha = self.smlp_and(alph_ranges, alph_global); #print('alpha') + beta = self.compute_beta_formula(beta_expr, feat_names+resp_names); #print('beta') + eta_ranges = self.compute_input_ranges_formula_alpha_eta('eta', feat_names); #print('eta_ranges') + eta_grids = self.compute_grid_range_formulae_eta(feat_names); #print('eta_grids') + eta_global = self.compute_eta_formula(eta_expr, feat_names); #print('eta_global', eta_global) + eta = self.smlp_and_multi([eta_ranges, eta_grids, eta_global]); #print('eta', eta) + else: + alph_global = True + alpha = True + beta = True + eta_ranges = True + eta_grids = True + eta_global = True + eta = True + if pca_equations is not None: + self._smlp_terms_logger.warn('PCA is currently under development, so alpha, beta and eta constraints will be ignored.') + self._smlp_terms_logger.info('Alpha global constraints: ' + str(alph_global)) self._smlp_terms_logger.info('Alpha ranges constraints: ' + str(alph_ranges)) self._smlp_terms_logger.info('Alpha combined constraints: ' + str(alpha)) diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py index edbf7b59..080830fb 100644 --- a/src/smlp_py/train_sklearn.py +++ b/src/smlp_py/train_sklearn.py @@ -465,6 +465,8 @@ def sklearn_main(self, get_model_file_prefix, feat_names_dict, resp_names, algo, else: #union_feat_names = list(set(sum(list(feat_names_dict.values()), []))) - the ordering is affected union_feat_names = lists_union_order_preserving_without_duplicates(list(feat_names_dict.values())) + # Added check + union_feat_names = sorted(union_feat_names, key=lambda x: X_train.columns.tolist().index(x)) model = self._sklearn_train_multi_response(get_model_file_prefix, union_feat_names, resp_names, algo, X_train, X_test, y_train, y_test, hparam_dict, interactive_plots, seed, sample_weights_vect) diff --git a/src/smlp_train.py b/src/smlp_train.py index ba668708..ae22a4b5 100755 --- a/src/smlp_train.py +++ b/src/smlp_train.py @@ -112,10 +112,10 @@ def main(argv): logger.info('PREPARE DATA FOR MODELING') X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \ levels_dict, model_features_dict = dataInst.process_data( - inst, inst.data_fname, inst.new_data_fname, True, args.split_test, feat_names, resp_names, + inst, inst.data_fname, inst.new_data_fname, args.spec, True, args.split_test, feat_names, resp_names, args.train_first_n, args.train_random_n, args.train_uniform_n, args.interactive_plots, - args.data_scaler, args.mrmr_feat_count_for_prediction, - args.save_model, args.use_model) + args.data_scaler, args.feature_selection_model, args.feature_selection_count, + args.pca_feat_count_for_prediction, args.save_model, args.use_model) # model training, validation, testing, prediction on training and labeled data as well as new data when available. model = modelInst.build_models(inst, args.model, X, y, X_train, y_train, X_test, y_test, X_new, y_new,