1+ import pandas as pd
2+
3+ from typing import List
4+ from sklearn .preprocessing import PolynomialFeatures
5+
6+ from toolkit .machine_learning import ignore_columns_polyfeatures
7+
8+
def test_ignore_columns_polyfeatures_keeps_ignored_columns():
    """Columns listed in ``variables_to_ignore`` are returned unchanged.

    The frame mixes a binary column ('a'), numeric-looking strings ('d') and
    free-text labels ('e') as the ignored set, with three integer columns
    ('b', 'c', 'f') left for polynomial expansion.
    """
    ignored = ['a', 'd', 'e']
    df = pd.DataFrame({
        'a': [0, 0, 1, 0],
        'b': [16, 7, 6, 16],
        'c': [61, 57, 16, 36],
        'd': ['12', '22', '13', '44'],
        'e': ['Green', 'Red', 'Blue', 'Yellow'],
        'f': [1, 11, 23, 66],
    })

    df_processed = ignore_columns_polyfeatures(df, variables_to_ignore=ignored, n=2)

    # Values, dtypes and index of the ignored columns must all survive.
    pd.testing.assert_frame_equal(df[ignored], df_processed[ignored])
20+
21+
22+
def test_ignore_columns_polyfeatures_column_count():
    """The output holds the ignored columns plus every generated polynomial term."""
    df = pd.DataFrame({
        'a': [0, 0, 1, 0],
        'b': [16, 7, 6, 16],
        'c': [61, 57, 16, 36],
        'd': ['12', '22', '13', '44'],
        'e': ['Green', 'Red', 'Blue', 'Yellow'],
        'f': [1, 11, 23, 66],
    })

    df_processed = ignore_columns_polyfeatures(df, variables_to_ignore=['a', 'd', 'e'], n=2)

    # 3 ignored columns + 10 terms of degree <= 2 over ('b', 'c', 'f'):
    # 1 constant + 3 linear + 6 quadratic.
    assert len(df_processed.columns) == 13
34+
35+
36+
37+
38+
39+
40+
41+
42+
43+
44+
45+
46+
47+
48+
49+
50+
def ignore_columns_polyfeatures(X: pd.DataFrame, variables_to_ignore: List[str], n: int) -> pd.DataFrame:
    """
    Build polynomial features of degree ``n`` from every column of ``X``
    except the ones named in ``variables_to_ignore``.

    Typical use is to keep binary or categorical columns out of the
    expansion; the function is meant to be usable as a Pipeline step.

    Parameters
    ----------
    X : pd.DataFrame
        Dataset to expand selectively.
    variables_to_ignore : List[str]
        Names of the columns that are passed through untouched.
    n : int
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    pd.DataFrame
        The ignored columns first, followed by the generated polynomial
        feature columns (named by ``get_feature_names_out``). The result is
        indexed 0..len(X)-1: the index of ``X`` is not carried over.

    Raises
    ------
    KeyError
        If a name in ``variables_to_ignore`` is not a column of ``X``.
    """
    expandable = X.drop(columns=variables_to_ignore)

    # Drop the original index so the pass-through columns line up row by row
    # with the freshly built (default-indexed) polynomial frame below.
    passthrough = X[variables_to_ignore].reset_index(drop=True)

    transformer = PolynomialFeatures(degree=n)
    expanded = pd.DataFrame(
        transformer.fit_transform(expandable),
        columns=transformer.get_feature_names_out(expandable.columns),
    )

    return pd.concat([passthrough, expanded], axis=1)
0 commit comments