-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain.py
More file actions
88 lines (62 loc) · 2.9 KB
/
train.py
File metadata and controls
88 lines (62 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import warnings
import numpy as np
import datetime
import pandas_datareader.data as web
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import mlflow.sklearn
def acquire_training_data(symbol="BTC-USD", start=None, end=None):
    """
    Download daily price data for `symbol` through pandas_datareader.

    Called with no arguments this fetches BTC-USD from 2019-07-01 to
    2019-09-30, which is the range the script has always trained on.

    symbol : ticker passed to the 'yahoo' data source
    start  : first day to fetch (datetime); defaults to 2019-07-01
    end    : last day to fetch (datetime); defaults to 2019-09-30

    Returns whatever `web.DataReader` returns for the 'yahoo' source.
    """
    if start is None:
        start = datetime.datetime(2019, 7, 1)
    if end is None:
        end = datetime.datetime(2019, 9, 30)
    # NOTE(review): this performs a network request and depends on the
    # 'yahoo' backend of pandas_datareader being reachable.
    return web.DataReader(symbol, 'yahoo', start, end)
def digitize(n):
    """
    Turn a number into a binary label.

    Returns 1 when `n` is strictly greater than zero, and 0 otherwise
    (zero and negative values both map to 0).
    """
    return 1 if n > 0 else 0
def rolling_window(a, window):
    """
    Return all ordered sequences of `window` consecutive values of `a`.

    a      : array-like; windows are taken along its last axis
    window : number of values per window, between 1 and the length of
             the last axis of `a` (inclusive)

    The result has one extra trailing axis of length `window`, and the
    former last axis shrinks to `len - window + 1`.

    e.g. Input: ( np.array([1, 2, 3, 4, 5, 6]), 4 )
    Output:
    array([[1, 2, 3, 4],
           [2, 3, 4, 5],
           [3, 4, 5, 6]])

    Raises ValueError when `window` is smaller than 1 or larger than the
    last axis of `a`.

    The result is a strided view, not a copy: overlapping windows share
    memory, so writing into it changes several windows at once.
    """
    # Accept lists and other array-likes, not just ndarrays.
    a = np.asarray(a)
    length = a.shape[-1]

    if window < 1:
        raise ValueError("window must be at least 1, got {}".format(window))
    if window > length:
        raise ValueError(
            "window ({}) is larger than the last axis of the input ({})".format(
                window, length
            )
        )

    # Reuse the last axis' stride for the new window axis so that each
    # window starts one element after the previous one.
    shape = a.shape[:-1] + (length - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
def prepare_training_data(data):
    """
    Add the derived columns used for training and return the dataframe.

    input : Dataframe with expected schema; it must provide 'Open' and
            'Close' columns.

    Two columns are added to `data` in place:
      'Delta'      : Close - Open for each row
      'to_predict' : 1 when Delta is strictly positive, else 0

    The same dataframe object that was passed in is returned.
    """
    data['Delta'] = data['Close'] - data['Open']
    # Vectorized form of the rule implemented by digitize(): strictly
    # positive deltas become 1, everything else (including NaN) becomes 0.
    data['to_predict'] = (data['Delta'] > 0).astype('int64')
    return data
if __name__ == "__main__":
    # Train a random forest on a short BTC-USD history and record the
    # model and its per-label metrics in an MLflow run.
    warnings.filterwarnings("ignore")

    with mlflow.start_run():
        training_data = acquire_training_data()
        prepared_training_data_df = prepare_training_data(training_data)

        # DataFrame.as_matrix() was removed in pandas 1.0; `.values`
        # returns the same ndarray on both old and current pandas.
        btc_mat = prepared_training_data_df.values

        WINDOW_SIZE = 14

        # Each row of X is a window of WINDOW_SIZE consecutive values and
        # the matching entry of Y is the label of the row that follows it;
        # the final window is dropped because nothing follows it.
        # NOTE(review): column 7 is presumably 'to_predict' (the last
        # column added by prepare_training_data) - confirm against the
        # columns returned by the data source.
        X = rolling_window(btc_mat[:, 7], WINDOW_SIZE)[:-1, :]
        Y = prepared_training_data_df['to_predict'].values[WINDOW_SIZE:]

        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.25, random_state=4284, stratify=Y
        )

        clf = RandomForestClassifier(
            bootstrap=True,
            criterion='gini',
            min_samples_split=2,
            min_weight_fraction_leaf=0.0,
            n_estimators=50,
            random_state=4284,
            verbose=0,
        )
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)

        mlflow.sklearn.log_model(clf, "model_random_forest")

        print(classification_report(y_test, predicted))

        # Log precision, recall and F1 for label 0 first, then label 1.
        scorers = (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1score", f1_score),
        )
        for label in (0, 1):
            for metric_name, scorer in scorers:
                mlflow.log_metric(
                    "{}_label_{}".format(metric_name, label),
                    scorer(y_test, predicted, pos_label=label),
                )