Perosolar/ML_saveCSV_FE_V2.py at main · tccdem/Perosolar · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import os.path

os.mkdir('prediction_test')
os.mkdir('prediction_train')
os.mkdir('errors')
os.mkdir('models')
os.mkdir('feature-impo')
for n in range(0, 300):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import ensemble
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.utils import shuffle
    import sys
    import os
    import pickle
    import pandas as pd

    df = pd.read_csv("4181cubicnoF-14f.csv")
    # num_of_feature = len(df.columns) - 1
    feature_names = df.columns.values[1:]  # eliminate structure names
    X = df.iloc[:, 1:-1]
    y = df.iloc[:, -1]
    X, y = shuffle(X, y)
    ################################################
    X = X.astype(np.float32)
    offset = int(X.shape[0] * 0.8)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]
    # print(X,y)
    # Model
    # create and fit the best regression model
    # work2-11features params-kfold2
    params = {'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2,
              'n_estimators': 200, 'subsample': 0.8}
    # work2-11features params-kfold2
    # params={'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 6, 'min_samples_leaf': 3, 'min_samples_split': 3,
    # 'n_estimators': 300, 'subsample': 0.8}
    # params={'learning_rate': 0.08, 'loss': 'huber', 'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 2,
    # 'n_estimators': 800, 'subsample': 0.85}
    best_model = ensemble.GradientBoostingRegressor(**params)
    best_model.fit(X_train, y_train)
    import os

    cwd = os.getcwd()
    save_path = cwd + r'/models'
    filename = "model_{}.sav".format(n)
    completeName = os.path.join(save_path, filename)
    pickle.dump(best_model, open(completeName, 'wb'))
    # make predictions using the model
    predictions_test = best_model.predict(X_test)
    predictions_train = best_model.predict(X_train)
    # print(dir(best_model))
    # print(best_model.get_params())
    # exit()
    # save errors for test with model name in a text file
    MSE_test = round(mean_squared_error(y_test, predictions_test), 3)
    R2_test = round(r2_score(y_test, predictions_test), 3)
    RMSD_test = round(mean_squared_error(y_test, predictions_test, squared=False), 3)
    MAD_test = round(mean_absolute_error(y_test, predictions_test), 3)
    # save errors for train with model name in a text file
    MSE_train = round(mean_squared_error(y_train, predictions_train), 3)
    R2_train = round(r2_score(y_train, predictions_train), 3)
    RMSD_train = round(mean_squared_error(y_train, predictions_train, squared=False), 3)
    MAD_train = round(mean_absolute_error(y_train, predictions_train), 3)

    # 'mkdir' creates a directory in current directory.
    save_path = cwd + r'/errors'
    file_name = "accuracies_model_{}.txt".format(n)
    completeName = os.path.join(save_path, file_name)
    file1 = open(completeName, "w")
    # file = open("accuracies_model_{}.txt".format(n),"w")
    file1.write('model_%d' % n + ':' + '\n' + 'MSE_test=' + str(MSE_test) + '\n' + 'R2_test=' + str(R2_test) + '\n'
                + 'RMSD_test=' + str(RMSD_test) + '\n' + 'MAD_test=' + str(MAD_test) + '\n' + 'MSE_train=' + str(
        MSE_train) + '\n'
                + 'R2_train=' + str(R2_train) + '\n' + 'RMSD_train=' + str(RMSD_train) + '\n' + 'MAD_train=' + str(
        MAD_train)
                )

    # save prediction an real y in separete csv file for test and train for each model
    predictions_train = list(predictions_train)
    y_train = list(y_train)
    predictions_test = list(predictions_test)
    y_test = list(y_test)
    df_test = pd.DataFrame(columns=['predictions_test'])
    df_test = pd.DataFrame(columns=['y_test'])
    df_test['predictions_test'] = predictions_test
    df_test['y_test'] = y_test
    # display(df_test)
    df_test.to_csv(os.path.join('prediction_test', "y_and_pred_test_model_{}.csv".format(n)))
    # df_test.to_csv("y_and_pred_test_model_{}.csv".format(n), index=False)
    df_train = pd.DataFrame(columns=['predictions_train'])
    df_train = pd.DataFrame(columns=['y_train'])
    df_train['predictions_train'] = predictions_train
    df_train['y_train'] = y_train
    df_train.to_csv(os.path.join('prediction_train', "y_and_pred_train_model_{}.csv".format(n)))
    # df_train.to_csv("y_and_pred_train_model_{}.csv".format(n), index=False)
    ########################################################################feature importance
    feature_importance = best_model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0])
    f2 = []
    for m in range(len(sorted_idx)):
        f2.append(feature_names[sorted_idx[m]])
    save_path = cwd + r'/feature-impo'
    file_name = "feature-importance_model_{}.txt".format(n)
    completeName = os.path.join(save_path, file_name)
    file2 = open(completeName, "w")
    # file = open("feature-importance_model_{}.txt".format(n),"w")
    # print(pos,f2,feature_importance[sorted_idx])
    file2.write(str(pos) + '\n' + str(f2) + '\n' + str(feature_importance[sorted_idx]))

file1.close()
file2.close()