From f8157e5618bc7d4decc9bea4248321cbe270dbb6 Mon Sep 17 00:00:00 2001
From: Mihai Stere <mihai.stere@student.manchester.ac.uk>
Date: Thu, 26 Jun 2025 00:52:50 +0300
Subject: [PATCH] Mihai Stere feature compression changes

---
 README.md                    |  12 +-
 src/smlp_py/smlp_data.py     | 154 ++++++++++++++++++++----
 src/smlp_py/smlp_flows.py    |  11 +-
 src/smlp_py/smlp_mrmr.py     |   7 +-
 src/smlp_py/smlp_optimize.py |   6 +-
 src/smlp_py/smlp_pca.py      | 225 +++++++++++++++++++++++++++++++++++
 src/smlp_py/smlp_shap.py     |  68 +++++++++++
 src/smlp_py/smlp_terms.py    |  32 +++--
 src/smlp_py/train_sklearn.py |   2 +
 src/smlp_train.py            |   6 +-
 10 files changed, 467 insertions(+), 56 deletions(-)
 create mode 100644 src/smlp_py/smlp_pca.py
 create mode 100644 src/smlp_py/smlp_shap.py

diff --git a/README.md b/README.md
index 1bd0d4d5..27fdaa43 100644
--- a/README.md
+++ b/README.md
@@ -53,13 +53,13 @@ instruction for the installation on Ubuntu.
 
     # first the tool itself
     ../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \
-    -out_dir ./ -pref Test83 -mode optimize -pareto t -sat_thresh f \
+    -out_dir ./ -pref Test83 -mode optimize -pareto t \
     -resp y1,y2 -feat x,p1,p2 -model dt_sklearn -dt_sklearn_max_depth 15 \
     -spec smlp_toy_num_resp_mult_free_inps -data_scaler min_max \
     -beta "y1>7 and y2>6" -objv_names obj1,objv2,objv3 \
     -objv_exprs "(y1+y2)/2;y1/2-y2;y2" -epsilon 0.05 -delta_rel 0.01 \
-    -save_model_config f -mrmr_pred 0 -plots f -seed 10 -log_time f \
-    -spec ../specs/smlp_toy_num_resp_mult_free_inps
+    -save_model_config f -plots f -seed 10 -log_time f \
+    -spec ../specs/smlp_toy_num_resp_mult_free_inps.spec
 
     # then the regression script
     ./smlp_regr.py -w 1 -def n -t 88 -tol 5
@@ -98,8 +98,8 @@ will produce the SMLP command for the regression test number 1:
 
 	../../src/run_smlp.py -data "../data/smlp_toy_num_resp_mult" \
 	-out_dir ./ -pref Test1 -mode train -resp y1 -feat x,p1,p2 \
-	-model dt_caret -save_model_config f -mrmr_pred 0 -plots f \
-	-seed 10 -log_time f
+	-model dt_caret -save_model_config f -feat_select_model mrmr \
+	-feat_select_count 0 -plots f -seed 10 -log_time f
 
 For details about those parameters, please refer to the help messages (-h) of
 both tools, src/run_smlp.py and regr_smlp/code/smlp_regr.py, as well as the
@@ -159,12 +159,14 @@ directory, run the following commands:
     -mode optimize -pareto f -sat_thresh f -resp o0 -feat \
     Byte,CH,RANK,Timing,i0,i1,i2,i3 -model dt_sklearn -dt_sklearn_max_depth 15 \
     -data_scaler min_max -epsilon 0.05 -log_time f -plots f \
+	-feat_select_model mrmr -feat_select_count 15 \
     -spec ../specs/smlp_s2_tx
 
     ../../src/run_smlp.py -out_dir ./ -pref smlp_s2_tx_nn -data ../data/smlp_s2_tx \
     -mode optimize -pareto f -sat_thresh f -resp o0 \
     -feat Byte,CH,RANK,Timing,i0,i1,i2,i3 \
     -model nn_keras -nn_keras_epochs 20 -data_scaler min_max \
+	-feat_select_model mrmr -feat_select_count 15 \
     -epsilon 0.05 -log_time f -plots f  -spec ../specs/smlp_s2_tx 
 
 These runs will take longer than the regression tests provided earlier,
diff --git a/src/smlp_py/smlp_data.py b/src/smlp_py/smlp_data.py
index eb5d6cb1..309819d7 100644
--- a/src/smlp_py/smlp_data.py
+++ b/src/smlp_py/smlp_data.py
@@ -18,6 +18,9 @@
 from smlp_py.smlp_utils import (np_JSONEncoder, list_intersection, str_to_bool, list_unique_unordered,
     lists_union_order_preserving_without_duplicates, get_response_type, cast_type, pd_df_col_is_numeric)
 from smlp_py.smlp_mrmr import SmlpMrmr
+from smlp_py.smlp_shap import SmlpShap
+from smlp_py.smlp_pca import SmlpPCA
+#from smlp_py.smlp_spec import SmlpSpec
 #from smlp_py.smlp_spec import SmlpSpec
 from smlp_py.smlp_constants import *
 from smlp_py.smlp_discretize import SmlpDiscretize
@@ -35,8 +38,10 @@ def __init__(self):
         # and in that case we might need to instantiate MrmrFeatures at a top level script (like smslp_train.py)
         # along with instantiating DataCommon (and maybe ModelsCommon).
         self._mrmrInst = SmlpMrmr() 
+        self._shapInst = SmlpShap()
+        self._pcaInst = SmlpPCA()  
         self._specInst = None # SmlpSpec()
-        
+
         # default values of parameters related to dataset; used to generate args.
         self._DEF_TRAIN_FIRST = 0 # subseting first_n rows from training data
         self._DEF_TRAIN_RAND = 0  # sampling random_n rows from training data
@@ -65,6 +70,10 @@ def __init__(self):
         
         # features that must be used in training models (assuming they are within initially defined input features)
         self._DEF_KEEP_FEATURES = []
+
+        # SMLP feature selection default parameters on model used and feature counts (off by default)
+        self._DEF_FEATURE_SELECTION = 'mrmr'
+        self._DEF_FEATURE_COUNT = 0
         
         # Dictionary with features (names) that have a missing values as dictionary keys and row indices of the 
         # missing values as dictionary values. It is computed just before imputing missing values in features. 
@@ -264,8 +273,18 @@ def __init__(self):
             'response_plots': {'abbr':'resp_plots', 'default': self._DEF_RESPONSE_PLOTS, 'type':str_to_bool,
                 'help': 'Should response value distribution plots be genrated during data processing? ' +
                     'A related option interactive_plots controls whether the generated plots should be ' +
-                    'displayed interactively during runtime [default: ' + str(self._DEF_RESPONSE_PLOTS) + ']'}
-            } | self._mrmrInst.mrmr_params_dict
+                    'displayed interactively during runtime [default: ' + str(self._DEF_RESPONSE_PLOTS) + ']'},
+            'feature_selection_model': {'abbr':'feat_select_model', 'default': self._DEF_FEATURE_SELECTION, 'type':str,
+                'help': 'What feature selection model do you want to use? ' +
+                    'Can choose between mrmr, shap, mrmr_shap or mrmr_shap_ensemble meaning the model can ' +
+                    'use just mrmr, just shap, mrmr then shap, or both scored combined' + 
+                    ' in order to select features [default: ' + str(self._DEF_FEATURE_SELECTION) + ']'},
+            'feature_selection_count': {'abbr':'feat_select_count', 'default': self._DEF_FEATURE_COUNT, 'type':int,
+                'help': 'The amount of features the feature selection model should take into account. ' +
+                    'If mrmr_shap is chosen as the model, then the number of features will first be chosen by mrmr, then by shap ' +
+                    'and then listed together, and if mrmr_shap_ensemble is chosen, then the importance values will be combined ' +
+                    'and then features are chosen [default: ' + str(self._DEF_FEATURE_COUNT) + ']'}
+            } | self._pcaInst.pca_params_dict
         self.data_bounds_dict = None
         
     
@@ -273,6 +292,8 @@ def __init__(self):
     def set_logger(self, logger):
         self._data_logger = logger 
         self._mrmrInst.set_logger(logger)
+        self._shapInst.set_logger(logger)
+        self._pcaInst.set_logger(logger)
     
     # report_file_prefix is a string used as prefix in all report files of SMLP
     def set_report_file_prefix(self, report_file_prefix):
@@ -908,7 +929,37 @@ def _split_data_for_training(self, X, y, split_test, train_first_n, train_random
             #print('X_test with y_test > 0.9', X_test.shape); print(X_test.head())
 
         return X_train, y_train, X_test, y_test
-    
+
+    def _combine_feature_importance(self, shap_importance, mrmr_importance, feature_names, feat_count):
+        
+        # Combines SHAP and MRMR feature importance scores using MinMax iteration.
+        importance_list = []
+
+        for i, feature in enumerate(feature_names):
+            # Get absolute MRMR and SHAP values
+            shap_val = abs(shap_importance[i])  
+            mrmr_val = abs(mrmr_importance.get(feature, 0))
+
+            importance_list.append([feature, shap_val, mrmr_val])
+
+        importance_df = pd.DataFrame(importance_list, columns=['Feature', 'SHAP_Importance', 'MRMR_Importance'])
+
+        # Normalize SHAP and MRMR importance to [0,1] range
+        scaler = MinMaxScaler()
+        importance_df[['SHAP_Importance', 'MRMR_Importance']] = scaler.fit_transform(
+            importance_df[['SHAP_Importance', 'MRMR_Importance']]
+        )
+
+        # Compute the final combined importance score
+        importance_df['Combined_Importance'] = importance_df['SHAP_Importance'] + importance_df['MRMR_Importance']
+
+        # Sort by importance score and select top features
+        importance_df = importance_df.sort_values(by='Combined_Importance', ascending=False)
+        top_features = importance_df['Feature'].head(feat_count).tolist()
+        print(importance_df)
+
+        return importance_df , top_features
+
     # load data, scale using sklearn MinMaxScaler(), then split into training and test 
     # subsets with ratio given by split_test. Optionally (based on arguments train_first_n, 
     # train_random_n, train_uniform_n), subsample/resample training data using function
@@ -922,11 +973,12 @@ def _split_data_for_training(self, X, y, split_test, train_first_n, train_random
     # Besides training and test subsets, the function returns also the MinMaxScaler object 
     # used for data normalization, to be reused for applying the model to unseen datasets
     # and also to rescale back the prediction results to the original scale where needed.
-    def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test:float, 
+    def _prepare_data_for_modeling(self, data_file:str, spec_path:str, is_training:bool, split_test:float, 
             feat_names:list[str], resp_names:list[str], keep_feat:list[str], out_prefix:str, 
             train_first_n:int, train_random_n:int, train_uniform_n:int, interactive_plots:bool, 
-            response_plots:bool, mrmr_features_n:int, pos_value:int, neg_value:int, resp_map:str, resp_to_bool:str, 
-            scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, 
+            response_plots:bool, feat_select_model:str, feat_select_count:int, pca_features:int,
+            pos_value:int, neg_value:int, resp_map:str,
+            resp_to_bool:str, scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, 
             mm_scaler_feat=None, mm_scaler_resp=None, levels_dict=None, model_features_dict=None):
         data_version_str = 'training' if is_training else 'new'
         self._data_logger.info('Preparing ' + data_version_str + ' data for modeling: start')
@@ -945,7 +997,6 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test
         if not y is None:
             assert set(resp_names) == set(y.columns.tolist())
         
-        
         # TMP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! start
         try_correlations = False
         if try_correlations: # use test 46
@@ -961,14 +1012,64 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test
         if is_training:
             #keep_feat = keep_feat + self._specInst.get_spec_constraint_vars(); #print('keep_feat', keep_feat)
             #print('features before mrmr', feat_names)
+            pca_equations = None
+            input_keep_feat = set(ft for ft in keep_feat if ft not in resp_names) 
             for rn in resp_names:
-                mrmr_feat, _ = self._mrmrInst.smlp_mrmr(X, y[rn], mrmr_features_n); #print('mrmr_feat', mrmr_feat)
-                model_feat = [ft for ft in feat_names if (ft in mrmr_feat or ft in keep_feat)]; #print(model_feat); 
-                model_features_dict[rn] = model_feat #mrmr_feat
+                selected_features = set(input_keep_feat)
+                if feat_select_model not in {'mrmr', 'shap', 'mrmr_shap_ensemble', 'mrmr_shap'}:
+                    raise ValueError(f"Invalid feature selection model '{feat_select_model}'. Allowed values are: 'mrmr', 'shap', 'mrmr_shap_ensemble', 'mrmr_shap'.")
+                # If mrmr model or mrmr and shap model is called
+                if (feat_select_model == 'mrmr' or feat_select_model == 'mrmr_shap') and feat_select_count > 0:
+                    mrmr_feat, _ , mrmr_model= self._mrmrInst.smlp_mrmr(X, y[rn], feat_select_count); #print('mrmr_feat', mrmr_feat)
+                    model_feat = [ft for ft in feat_names if (ft in mrmr_feat or ft in keep_feat)]; #print(model_feat); 
+                    selected_features.update(model_feat)
+                # If shap model or mrmr and shap model is called
+                if (feat_select_model == 'shap' or feat_select_model == 'mrmr_shap') and feat_select_count > 0:
+                    shap_feat, shap_importance = self._shapInst.smlp_shap(X, y[rn], feat_select_count)
+                    shap_feat = [ft for ft in feat_names if (ft in shap_feat or ft in keep_feat)]
+                    selected_features.update(shap_feat)
+                # If mrmr and shap ensemble model is called
+                if feat_select_model == 'mrmr_shap_ensemble' and feat_select_count > 0:
+                    mrmr_feat, _ , mrmr_model= self._mrmrInst.smlp_mrmr(X, y[rn], feat_select_count)
+                    shap_feat, shap_importance = self._shapInst.smlp_shap(X, y[rn], feat_select_count)
+
+                    model , mrmr_shap_feat = self._combine_feature_importance(shap_importance, mrmr_model, feat_names, feat_select_count)
+
+                    self._data_logger.info(f'MRMR_SHAP ensemble selected feature scores for response {rn}:\n{model}')
+                    self._data_logger.info(f'MRMR_SHAP ensemble selected features for response {rn}:{mrmr_shap_feat}')
+
+                    mrmr_shap_ensemble_feat = [ft for ft in mrmr_shap_feat if ft in feat_names]
+                    selected_features.update(mrmr_shap_ensemble_feat)
+                if feat_select_count > 0:
+                    model_features_dict[rn] = sorted(selected_features, key=lambda ft: feat_names.index(ft))
             feat_names = [ft for ft in feat_names if ft in 
                 lists_union_order_preserving_without_duplicates(list(model_features_dict.values()))]
             X = X[feat_names]
-            #print('features after mrmr', feat_names); print('model_features_dict after MRMR', model_features_dict)
+            # If pca model is called, still in development stage
+            if pca_features > 0:
+                max_components = min(pca_features, X.shape[1])
+                if pca_features > X.shape[1]:
+                    self._data_logger.warning(f"Requested {pca_features} PCA components, but only {X.shape[1]} features available. Using {max_components} components instead.")
+                    pca_features = X.shape[1]
+                X_pca, pca_model = self._pcaInst.smlp_pca(X[feat_names], y, pca_features , spec_path)
+                X_reversed = self._pcaInst.inverse_transform(X_pca, X)
+
+                # This is to compare the efficiency of reversing PCA, can comment this out
+                self._data_logger.info(f"Comparison between initial input matrix and PCA attempt to reconstruct it, initial matrix: \n {X} \n Reconstructed matrix: \n{X_reversed}")
+
+                # Calculating variance for information about model efficiency
+                variance = pca_model.explained_variance_ratio_
+                total_variance = sum(pca_model.explained_variance_ratio_)
+                self._data_logger.info(f"PCA Variance Ratio per output: {variance}")
+                self._data_logger.info(f"Total Variance Captured: {total_variance}")
+
+                pca_equations = self._pcaInst.get_feature_equations(X_pca, X)
+                X = X_pca  
+
+                for rn in resp_names:
+                    model_features_dict[rn] = list(X_pca.columns)  # Use PCA-transformed feature names
+
+                feat_names = list(X.columns)  
         
         # encode levels of categorical features as integers for model training (in feature selection tasks 
         # it is best to use the original categorical features). 
@@ -994,7 +1095,7 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test
             X_train, y_train, X_test, y_test = self._split_data_for_training(X, y, split_test, 
                 train_first_n, train_random_n, train_uniform_n)
             res = X, y, X_train, y_train, X_test, y_test, mm_scaler_feat, mm_scaler_resp, \
-                feat_names, resp_names, levels_dict, model_features_dict
+                feat_names, resp_names, pca_equations, levels_dict, model_features_dict
         else:
             res = X, y
 
@@ -1003,22 +1104,24 @@ def _prepare_data_for_modeling(self, data_file:str, is_training:bool, split_test
                 
     # Process data to prepare components required for training models and prediction, and reporting results in
     # original scale. Supports also prediction and results reporting in origibal scale from saved model
-    def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, is_training:bool, split_test, 
+    def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str, spec_path:str, is_training:bool, split_test, 
             feat_names:list[str], resp_names:list[str], keep_feat:list[str],  
             train_first_n:int, train_random_n:int, train_uniform_n:int, interactive_plots:bool, response_plots:bool,
-            scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, mrmr_features_n:int, 
-            pos_value, neg_value, resp_map:str, resp_to_bool, save_model:bool, use_model:bool):
+            scaler_type:str, scale_features:bool, scale_responses:bool, impute_responses:bool, feat_select_model:str,
+            feat_select_count:int, pca_features_n:int, pos_value, neg_value,
+            resp_map:str, resp_to_bool, save_model:bool, use_model:bool):
         
         #scale = not self._get_data_scaler(scaler_type) is None
         keep_feat = keep_feat + self._specInst.get_spec_constraint_vars()
         if data_file is not None:
             split_test = self._DEF_SPLIT_TEST if split_test is None else split_test
             X, y, X_train, y_train, X_test, y_test, mm_scaler_feat, mm_scaler_resp, \
-            feat_names, resp_names, levels_dict, model_features_dict = self._prepare_data_for_modeling(
-                data_file, True, split_test, feat_names, resp_names, keep_feat, report_file_prefix, 
+            feat_names, resp_names, pca_equations, levels_dict, model_features_dict = self._prepare_data_for_modeling(
+                data_file, spec_path, True, split_test, feat_names, resp_names, keep_feat, report_file_prefix, 
                 train_first_n, train_random_n, train_uniform_n, interactive_plots, response_plots,
-                mrmr_features_n, pos_value, neg_value, resp_map, resp_to_bool, scaler_type, 
-                scale_features, scale_responses, impute_responses, None, None, None, None)
+                feat_select_model, feat_select_count, pca_features_n, pos_value,
+                neg_value, resp_map, resp_to_bool, scaler_type, scale_features, scale_responses,
+                impute_responses, None, None, None, None)
             
             # santy check that: mm_scaler_feat is not None --> scaler_type != 'none'
             assert not scaler_type == 'none' or mm_scaler_feat is None
@@ -1040,13 +1143,14 @@ def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str,
             model_features_dict = self._load_model_features(self.model_features_dict_file)
             feat_names = lists_union_order_preserving_without_duplicates(list(model_features_dict.values()))
             #print('model_features_dict loaded', model_features_dict); print('feat_names loaded', feat_names)
-            X, y, X_train, y_train, X_test, y_test =  None, None, None, None, None, None 
+            X, y, X_train, y_train, X_test, y_test , pca_equations =  None, None, None, None, None, None 
         
         if new_data_file is not None:
             X_new, y_new = self._prepare_data_for_modeling(
-                new_data_file, False, None, feat_names, resp_names, keep_feat, report_file_prefix,  None, None, None, 
-                interactive_plots, response_plots, mrmr_features_n, pos_value, neg_value, resp_map, resp_to_bool, scaler_type,
-                scale_features, scale_responses, impute_responses, mm_scaler_feat, mm_scaler_resp, levels_dict, model_features_dict)
+                new_data_file, spec_path, False, None, feat_names, resp_names, keep_feat, report_file_prefix,  None, None, None, 
+                interactive_plots, response_plots, feat_select_model, feat_select_count, pca_features_n,
+                pos_value, neg_value, resp_map, resp_to_bool, scaler_type,scale_features, scale_responses,
+                impute_responses, mm_scaler_feat, mm_scaler_resp, levels_dict, model_features_dict)
         else:
             X_new, y_new = None, None
 
@@ -1059,4 +1163,4 @@ def process_data(self, report_file_prefix:str, data_file:str, new_data_file:str,
             X_new = X_new[common_features]
         
         return X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \
-            levels_dict, model_features_dict, feat_names, resp_names
+            levels_dict, pca_equations, model_features_dict, feat_names, resp_names
diff --git a/src/smlp_py/smlp_flows.py b/src/smlp_py/smlp_flows.py
index d29f1f96..85a23548 100644
--- a/src/smlp_py/smlp_flows.py
+++ b/src/smlp_py/smlp_flows.py
@@ -299,13 +299,14 @@ def smlp_flow(self):
             #self.logger.info('Running SMLP in mode "{}": Start'.format(args.analytics_mode))
             self.logger.info('PREPARE DATA FOR MODELING')
             X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \
-            levels_dict, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
-                self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, True, args.split_test, 
+            levels_dict, pca_equations, model_features_dict, feat_names, resp_names = self.dataInst.process_data(
+                self.configInst.report_file_prefix, self.data_fname, self.new_data_fname, args.spec, True, args.split_test, 
                 feat_names, resp_names, args.keep_features, args.train_first_n, args.train_random_n, args.train_uniform_n, 
                 args.interactive_plots, args.response_plots, args.data_scaler,
-                args.scale_features, args.scale_responses, args.impute_responses, args.mrmr_feat_count_for_prediction, 
+                args.scale_features, args.scale_responses, args.impute_responses, args.feature_selection_model,
+                args.feature_selection_count, args.pca_feat_count_for_prediction,
                 args.positive_value, args.negative_value, args.response_map, args.response_to_bool, args.save_model, args.use_model)
-
+            
             # sanity check that the order of features in model_features_dict, feat_names, X_train, X_test, X is 
             # the same; this is mostly important for model exploration modes 
             self.modelInst.model_features_sanity_check(model_features_dict, feat_names, X_train, X_test, X)
@@ -388,7 +389,7 @@ def smlp_flow(self):
             elif args.analytics_mode == 'optimize':
                 self.optInst.smlp_optimize(syst_expr_dict, args.model, model,
                     self.dataInst.unscaled_training_features, self.dataInst.unscaled_training_responses, 
-                    model_features_dict, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto, 
+                    model_features_dict, pca_equations, feat_names, resp_names, objv_names, objv_exprs, args.optimize_pareto, 
                     args.optimization_strategy, quer_names, quer_exprs, 
                     delta_dict, args.epsilon, alpha_global_expr, beta_expr, args.eta, theta_radii_dict, 
                     args.solver_logic, args.vacuity_check, 
diff --git a/src/smlp_py/smlp_mrmr.py b/src/smlp_py/smlp_mrmr.py
index 2ed66326..3a7ca811 100644
--- a/src/smlp_py/smlp_mrmr.py
+++ b/src/smlp_py/smlp_mrmr.py
@@ -11,9 +11,6 @@ def __init__(self):
         self._MRMR_FEATURES_PRED = 15
         self._MRMR_FEATURES_CORR = 15
         self.mrmr_params_dict = {
-            'mrmr_feat_count_for_prediction': {'abbr':'mrmr_pred', 'default':self._MRMR_FEATURES_PRED, 'type':int,
-                'help':'Count of features selected by MRMR algorithm for predictive models '  +
-                    '[default: {}]'.format(str(self._MRMR_FEATURES_PRED))},
             'mrmr_feat_count_for_correlation': {'abbr':'mrmr_corr', 'default':self._MRMR_FEATURES_CORR, 'type':int,
                 'help':'Count of features selected by MRMR algorithm for correlation analysis '  +
                     '[default: {}]'.format(str(self._MRMR_FEATURES_CORR))}
@@ -63,7 +60,7 @@ def _mrmr_regres(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f', redund
                                str(y.name) + ' :\n'+ str(mrmr_scores_df))
         
         self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end') 
-        return mrmr_res[0], mrmr_scores_df
+        return mrmr_res[0], mrmr_scores_df, mrmr_res[1]
 
     # mrmr feature selection using mrmr-feature package, where y is a categorical variable (pandas.Series)
     # TODO !!!: not tested
@@ -87,7 +84,7 @@ def _mrmr_class(self, X:pd.DataFrame, y:pd.Series, K:int, relevance='f',
                                y.name + ' :\n'+ str(mrmr_scores_df))
         
         self._mrmr_logger.info('MRMR feature selection for response ' + y.name + ' : end')
-        return mrmr_res[0], mrmr_scores_df
+        return mrmr_res[0], mrmr_scores_df, mrmr_res[1]
     
     def smlp_mrmr(self, X:pd.DataFrame, y:pd.Series, #resp_type:str, #"numeric", 
             feat_cnt:int):
diff --git a/src/smlp_py/smlp_optimize.py b/src/smlp_py/smlp_optimize.py
index 4ed0a29c..b8a03093 100644
--- a/src/smlp_py/smlp_optimize.py
+++ b/src/smlp_py/smlp_optimize.py
@@ -1096,8 +1096,8 @@ def check_synthesis_feasibility(self, feasibility:bool, objv_names:list[str], ob
     #     function (and to smlp_optsyn() instead of passing X,y; The bounds on objectives are not strictly necessary,
     #     any approximation may be used, but accurate approximation might reduce iterations count needed for
     #     computing optimal confoguurations (in optimize and optsyn modes)
-    def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict, 
-            feat_names:list[str], resp_names:list[str], 
+    def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFrame, y:pd.DataFrame, model_features_dict:dict,
+            pca_equations:dict, feat_names:list[str], resp_names:list[str], 
             objv_names:list[str], objv_exprs, pareto:bool, strategy:str, #asrt_names:list[str], asrt_exprs, 
             quer_names:list[str], quer_exprs, delta:float, epsilon:float, 
             alph_expr:str, beta_expr:str, eta_expr:str, theta_radii_dict:dict, solver_logic:str, vacuity:bool, 
@@ -1115,7 +1115,7 @@ def smlp_optimize(self, syst_expr_dict:dict, algo:str, model:dict, X:pd.DataFram
             
         domain, syst_term_dict, model_full_term_dict, eta, alpha, beta, interface_consistent, model_consistent = \
         self._modelTermsInst.create_model_exploration_base_components(
-            syst_expr_dict, algo, model, model_features_dict, feat_names, resp_names, 
+            syst_expr_dict, algo, model, model_features_dict, pca_equations, feat_names, resp_names, 
             #delta, epsilon, #objv_names, objv_exprs, None, None, None, None, 
             alph_expr, beta_expr, eta_expr, data_scaler, scale_feat, scale_resp, #scale_objv, 
             float_approx, float_precision, data_bounds_json_path)
diff --git a/src/smlp_py/smlp_pca.py b/src/smlp_py/smlp_pca.py
new file mode 100644
index 00000000..89ccaa4f
--- /dev/null
+++ b/src/smlp_py/smlp_pca.py
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# This file is part of smlp.
+
+import numpy as np
+import pandas as pd
+import os
+from sklearn.decomposition import PCA
+import json
+#from sklearn.preprocessing import StandardScaler
+
+class SmlpPCA:
+    def __init__(self):
+        self._pca_logger = None
+        self.pca_model = None
+        self._DEF_PCA_FEATURES_PRED = 0
+        self.pca_params_dict = {
+            'pca_feat_count_for_prediction': {'abbr':'pca_pred', 'default':self._DEF_PCA_FEATURES_PRED, 'type':int,
+                'help':'Count of features selected by pca algorithm '  +
+                    '[default: {}]'.format(str(self._DEF_PCA_FEATURES_PRED))},
+        }
+    def set_logger(self, logger):
+        self._pca_logger = logger
+        
+
+
+    def create_pca_based_spec(self, X: pd.DataFrame, y: pd.DataFrame, original_spec_path: str):
+        if self.pca_model is None:
+            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")
+
+        self._pca_logger.info('Generating PCA-based spec file: start')
+
+        # Read original spec file
+        with open(original_spec_path, "r") as f:
+            raw_text = f.read().replace('\t', '    ')
+            spec_data = json.loads(raw_text)
+
+        # Prepare output directory
+        base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+        output_dir = os.path.join(base_dir, "regr_smlp", "models")
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Collect original knob ranges
+        knob_ranges = {}
+        original_variables = spec_data.get('variables', [])
+        for item in original_variables:
+            if item.get('interface') in ['knob', 'input']:
+                knob_ranges[item['label']] = item['range']
+
+        # PCA components and feature information
+        components = self.pca_model.components_
+        feature_means = self.pca_model.mean_
+        feature_names = X.columns.tolist()
+        pca_knobs = []
+        pca_equations_log = ["PCA Component Equations:"]
+        # Create PCA knobs with corrected ranges and save equations
+        for i, component in enumerate(components):
+            pc_name = f"PC{i+1}"
+
+            min_pc, max_pc = 0, 0
+            offset = 0.0
+            terms = []
+
+            for weight, feature in zip(component, feature_names):
+                if feature not in knob_ranges:
+                    continue
+                feature_min, feature_max = knob_ranges[feature]
+                if weight >= 0:
+                    min_pc += weight * feature_min
+                    max_pc += weight * feature_max
+                else:
+                    min_pc += weight * feature_max
+                    max_pc += weight * feature_min
+
+                # Accumulate the offset for mean centering
+                feature_idx = feature_names.index(feature)
+
+                offset -= weight * feature_means[feature_idx]
+
+                # Build equation term
+                if abs(weight) > 1e-6:
+                    terms.append(f"{weight:.4f} * {feature}")
+
+            # Final corrected min/max with offset
+            estimated_min = min(min_pc, max_pc) + offset
+            estimated_max = max(min_pc, max_pc) + offset
+            estimated_range = [round(estimated_min, 4), round(estimated_max, 4)]
+
+            # Create knob
+            pca_knobs.append({
+                "label": pc_name,
+                "interface": "knob",
+                "type": "real",
+                "range": estimated_range,
+                "rad-rel": 0.05
+            })
+
+            # Create readable equation for PCA component
+            equation = " + ".join(terms)
+            equation += f" + {offset:.4f}"
+            pca_equations_log.append(f"{pc_name} ≈ {equation}")
+
+        # Keep outputs unchanged
+        output_variables = []
+        for item in original_variables:
+            if item.get('interface') == 'output':
+                output_variables.append(item)
+
+        # Combine into new spec
+        new_variables = pca_knobs + output_variables
+        new_spec = {
+            "version": spec_data.get("version", "1.2"),
+            "variables": new_variables,
+            "objectives": spec_data.get("objectives", {})
+        }
+
+        # Save new PCA spec
+        output_spec_path = os.path.join(output_dir, "pca_generated.spec")
+        with open(output_spec_path, "w") as f:
+            json.dump(new_spec, f, indent=4)
+
+        self._pca_logger.info(f"PCA-based spec saved to {output_spec_path}")
+
+        # Save PCA component equations
+        pca_equations_path = os.path.join(output_dir, "pca_component_equations.txt")
+        with open(pca_equations_path, "w") as f:
+            f.write("\n".join(pca_equations_log))
+
+        self._pca_logger.info(f"PCA component equations saved to {pca_equations_path}")
+        self._pca_logger.info('Generating PCA-based spec file: end')
+
+        return "OK"
+
+    def smlp_pca(self, X: pd.DataFrame, y: pd.DataFrame, feat_count: int , spec_path: str):
+        
+        if X.shape[1] == 0:
+            return X , None
+
+        self._pca_logger.info('PCA feature compression: start')
+
+        self.pca_model = PCA(n_components=feat_count)
+        X_pca = self.pca_model.fit_transform(X)
+
+        pca_columns = [f'PC{i+1}' for i in range(feat_count)]
+        X_pca_df = pd.DataFrame(X_pca, index=X.index, columns=pca_columns)
+
+        self._pca_logger.info(f'PCA applied: Reduced {X.shape[1]} features to {feat_count} components')
+        self._pca_logger.info('PCA feature compression: end')
+
+        # Go back two levels from smlp_py to project root
+        base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+        # Target output directory
+        output_dir = os.path.join(base_dir, "regr_smlp", "models")
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Save PCA components
+        pca_components_path = os.path.join(output_dir, "pca_components.csv")
+        X_pca_df.to_csv(pca_components_path, index=False)
+        self._pca_logger.info(f"PCA components saved to {pca_components_path}")
+
+        # Save combined PCA components and outputs
+        X_pca_and_outputs = pd.concat([X_pca_df, y.loc[X_pca_df.index]], axis=1)
+        pca_full_data_path = os.path.join(output_dir, "pca_full_data.csv")
+        X_pca_and_outputs.to_csv(pca_full_data_path, index=False)
+        self._pca_logger.info(f"PCA components with outputs saved to {pca_full_data_path}")
+
+        return X_pca_df, self.pca_model
+
+    def inverse_transform(self, X_pca , X):
+        # Converts PCA features back to original feature space using stored PCA model.
+
+        if self.pca_model is None:
+            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")
+
+        X_reconstructed = pd.DataFrame(self.pca_model.inverse_transform(X_pca), index=X_pca.index)
+        X_reconstructed.columns = list(X.columns)
+
+        return X_reconstructed
+
+    def get_feature_equations(self, X_pca, X):
+        # Generate a dictionary mapping each original feature to its equation in terms of PCA-transformed components .
+
+        if self.pca_model is None:
+            raise ValueError("PCA model has not been trained. Call smlp_pca() first.")
+        W = self.pca_model.components_  
+        W_inv = np.linalg.pinv(W)  
+
+        # Get the mean values of original features
+        feature_means = self.pca_model.mean_
+        feature_equations = {}
+        equations_log = ["PCA Equations for Original Features:"]
+
+        for i, feature_name in enumerate(X.columns):
+            coefficients = W_inv[i, :]
+
+            # Create equation for each feature
+            feature_index = X.columns.get_loc(feature_name)
+            equation = " + ".join(f"{coeff:.4f} * {pc}" for coeff, pc in zip(coefficients, X_pca.columns))
+            equation += f" + {feature_means[feature_index]:.4f}"
+            equation_terms = {pc: coeff for pc, coeff in zip(X_pca.columns, coefficients)}
+
+            # Store each output equation
+            feature_equations[feature_name] = {
+                "terms": equation_terms,
+                "offset": feature_means[i]
+            }
+
+            equations_log.append(f"{feature_name} ≈ {equation}")
+        # Logs all equations, can be commented out if not relevant
+        self._pca_logger.info("\n".join(equations_log))
+
+        base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+        # Target output directory
+        output_dir = os.path.join(base_dir, "regr_smlp", "models")
+        os.makedirs(output_dir, exist_ok=True)
+        # Save PCA feature equations
+        pca_equations_path = os.path.join(output_dir, "pca_equations.txt")
+        with open(pca_equations_path, "w") as f:
+            f.write("\n".join(equations_log))
+        self._pca_logger.info(f"PCA feature equations saved to {pca_equations_path}")
+
+
+        return feature_equations
+
diff --git a/src/smlp_py/smlp_shap.py b/src/smlp_py/smlp_shap.py
new file mode 100644
index 00000000..5eee0852
--- /dev/null
+++ b/src/smlp_py/smlp_shap.py
@@ -0,0 +1,68 @@
+import shap
+import pandas as pd
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import r2_score
+
+class SmlpShap:
+    def __init__(self):
+        self._shap_logger = None
+        self._SHAP_FEATURES_PRED = 15
+        self._DEF_ESTIMATOR_NUMBER = 100
+
+    def set_logger(self, logger):
+        self._shap_logger = logger
+    
+    def smlp_shap(self, X: pd.DataFrame, y: pd.Series, feat_cnt: int):
+        if feat_cnt == 0 or X.shape[1] == 0:
+            return [], np.zeros(X.shape[1])
+        self._shap_logger.info(f'SHAP feature selection for response {y.name}: start')
+
+
+        # Code added in order to reduce training time
+        X_sample = X.sample(n=min(1000, len(X)), random_state=42)
+        y_sample = y.loc[X_sample.index]
+
+        # Train the model
+        model = RandomForestRegressor(n_estimators=self._DEF_ESTIMATOR_NUMBER, n_jobs=-1)
+        model.fit(X_sample, y_sample)
+
+        r2 = r2_score(y_sample, model.predict(X_sample))
+        if r2 < 0.01:
+            self._shap_logger.warning(f"Model R² score for response {y.name} is very low (R² = {r2:.5f}); SHAP results may be uninformative.")
+            return [], np.zeros(X_sample.shape[1])
+
+
+        # Option 1: KernelExplainer (model independent, very slow)
+        # This is extremely slow for large datasets.
+        # explainer = shap.KernelExplainer(model.predict, X_sample)
+
+        # Option 2: API Explainer (automatically selects an explainer type, slow for tree models)
+        # explainer = shap.Explainer(model.predict, X_sample, feature_perturbation="interventional")
+
+        # Option 3 (recommended): TreeExplainer (fast and gives similar results to Kernel and API explainers in testing)
+        explainer = shap.TreeExplainer(model)
+        
+        # Compute mean absolute SHAP value per feature
+        shap_values = explainer.shap_values(X_sample)
+        if isinstance(shap_values, list):
+            shap_importance = np.abs(np.array(shap_values)).mean(axis=(0, 1))
+        else:
+            shap_importance = np.abs(shap_values).mean(axis=0)
+        
+        if np.all(shap_importance == 0):
+            self._shap_logger.warning(f"All SHAP values are 0 for response {y.name}; skipping SHAP feature selection.")
+            return [], shap_importance
+        
+        # Rank features by SHAP importance in descending order
+        feature_importance = pd.DataFrame({
+            'Feature': X_sample.columns,
+            'SHAP_Importance': shap_importance
+        }).sort_values(by='SHAP_Importance', ascending=False)
+
+        selected_features = feature_importance['Feature'].iloc[:feat_cnt].tolist()
+        
+        self._shap_logger.info(f'SHAP selected feature scores for response {y.name}:{feature_importance}')
+        self._shap_logger.info(f'SHAP feature selection for response {y.name}: end')
+
+        return selected_features , shap_importance
\ No newline at end of file
diff --git a/src/smlp_py/smlp_terms.py b/src/smlp_py/smlp_terms.py
index ab0a6ddf..435390ab 100644
--- a/src/smlp_py/smlp_terms.py
+++ b/src/smlp_py/smlp_terms.py
@@ -2051,7 +2051,7 @@ def _compute_model_terms_dict(self, algo, model, feat_names, resp_names, data_bo
     def compute_models_terms_dict(self, algo, model_or_model_dict, model_features_dict, feat_names, resp_names, 
             data_bounds, data_scaler,scale_features, scale_responses):
         #print('model_features_dict', model_features_dict); print('feat_names', feat_names, 'resp_names', resp_names, flush=True)
-        assert lists_union_order_preserving_without_duplicates(list(model_features_dict.values())) == feat_names
+        assert set(lists_union_order_preserving_without_duplicates(list(model_features_dict.values()))) == set(feat_names)
         #print('model_or_model_dict', model_or_model_dict)
         if isinstance(model_or_model_dict, dict):
             models_full_terms_dict = {}
@@ -2355,7 +2355,7 @@ def var_domain(self, var, spec_domain_dict):
         return var_component
     
     # this function builds terms and formulas for constraints, system description and the models
-    def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, model, model_features_dict:dict, feat_names:list, resp_names:list, 
+    def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, model, model_features_dict:dict, pca_equations:dict, feat_names:list, resp_names:list, 
             alph_expr:str, beta_expr:str, eta_expr:str, data_scaler, scale_feat, scale_resp, 
             float_approx=True, float_precision=64, data_bounds_json_path=None):
         self._smlp_terms_logger.info('Creating model exploration base components: Start')
@@ -2373,14 +2373,26 @@ def create_model_exploration_base_components(self, syst_expr_dict:dict, algo, mo
         
         # contraints on features used as control variables and on the responses
         alph_ranges = self.compute_input_ranges_formula_alpha_eta('alpha', feat_names); #print('alph_ranges')
-        alph_global = self.compute_global_alpha_formula(alph_expr, feat_names); #print('alph_global')
-        alpha = self.smlp_and(alph_ranges, alph_global); #print('alpha')
-        beta = self.compute_beta_formula(beta_expr, feat_names+resp_names); #print('beta')
-        eta_ranges = self.compute_input_ranges_formula_alpha_eta('eta', feat_names); #print('eta_ranges')
-        eta_grids = self.compute_grid_range_formulae_eta(feat_names); #print('eta_grids')
-        eta_global = self.compute_eta_formula(eta_expr, feat_names); #print('eta_global', eta_global)
-        eta = self.smlp_and_multi([eta_ranges, eta_grids, eta_global]); #print('eta', eta)
-        
+
+        if pca_equations is None:
+            alph_global = self.compute_global_alpha_formula(alph_expr, feat_names); #print('alph_global')
+            alpha = self.smlp_and(alph_ranges, alph_global); #print('alpha')
+            beta = self.compute_beta_formula(beta_expr, feat_names+resp_names); #print('beta')
+            eta_ranges = self.compute_input_ranges_formula_alpha_eta('eta', feat_names); #print('eta_ranges')
+            eta_grids = self.compute_grid_range_formulae_eta(feat_names); #print('eta_grids')
+            eta_global = self.compute_eta_formula(eta_expr, feat_names); #print('eta_global', eta_global)
+            eta = self.smlp_and_multi([eta_ranges, eta_grids, eta_global]); #print('eta', eta)
+        else:
+            alph_global = True
+            alpha = True
+            beta = True
+            eta_ranges = True
+            eta_grids = True
+            eta_global = True
+            eta = True
+        if pca_equations is not None:
+            self._smlp_terms_logger.warn('PCA is currently under development, so alpha, beta and eta constraints will be ignored.')
+            
         self._smlp_terms_logger.info('Alpha global   constraints: ' + str(alph_global))
         self._smlp_terms_logger.info('Alpha ranges   constraints: ' + str(alph_ranges))
         self._smlp_terms_logger.info('Alpha combined constraints: ' + str(alpha))
diff --git a/src/smlp_py/train_sklearn.py b/src/smlp_py/train_sklearn.py
index edbf7b59..080830fb 100644
--- a/src/smlp_py/train_sklearn.py
+++ b/src/smlp_py/train_sklearn.py
@@ -465,6 +465,8 @@ def sklearn_main(self, get_model_file_prefix, feat_names_dict, resp_names, algo,
         else:
             #union_feat_names = list(set(sum(list(feat_names_dict.values()), []))) - the ordering is affected 
             union_feat_names = lists_union_order_preserving_without_duplicates(list(feat_names_dict.values()))
+            # Added check 
+            union_feat_names = sorted(union_feat_names, key=lambda x: X_train.columns.tolist().index(x))
             model = self._sklearn_train_multi_response(get_model_file_prefix, union_feat_names, resp_names, algo,
                 X_train, X_test, y_train, y_test, hparam_dict, interactive_plots, 
                 seed, sample_weights_vect)
diff --git a/src/smlp_train.py b/src/smlp_train.py
index ba668708..ae22a4b5 100755
--- a/src/smlp_train.py
+++ b/src/smlp_train.py
@@ -112,10 +112,10 @@ def main(argv):
     logger.info('PREPARE DATA FOR MODELING')    
     X, y, X_train, y_train, X_test, y_test, X_new, y_new, mm_scaler_feat, mm_scaler_resp, \
     levels_dict, model_features_dict = dataInst.process_data(
-        inst, inst.data_fname, inst.new_data_fname, True, args.split_test, feat_names, resp_names, 
+        inst, inst.data_fname, inst.new_data_fname, args.spec, True, args.split_test, feat_names, resp_names, 
         args.train_first_n, args.train_random_n, args.train_uniform_n, args.interactive_plots, 
-        args.data_scaler, args.mrmr_feat_count_for_prediction,
-        args.save_model, args.use_model)
+        args.data_scaler, args.feature_selection_model, args.feature_selection_count, 
+        args.pca_feat_count_for_prediction, args.save_model, args.use_model)
     
     # model training, validation, testing, prediction on training and labeled data as well as new data when available.    
     model = modelInst.build_models(inst, args.model, X, y, X_train, y_train, X_test, y_test, X_new, y_new,