shape_detection/learn_shapes.py at main · naceee/shape_detection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
import pandas as pd
import warnings
from computing_persistent_homology import get_persistence_from_df
from persistent_images import create_persistence_image


def train_model(resolution):
    """Cross-validate a random forest on the persistence images of one resolution.

    Reads ``persistence_images_data/data_n=100_resolution={resolution}.csv``,
    takes the ``result`` column as the label and every other column as a
    feature, and collects out-of-fold predictions from 5-fold cross-validation.
    The predictions are tallied into a confusion matrix (rows: actual label,
    columns: predicted label) which is drawn, saved to
    ``images/acc_r={resolution}.pdf`` and shown.

    Args:
        resolution: Value substituted into the input CSV name and into the
            name of the saved figure.

    Returns:
        The fraction of samples whose cross-validated prediction equals
        their label (trace of the confusion matrix over its sum).
    """
    # NOTE: this installs a process-wide "ignore" filter; warnings stay
    # silenced after the function returns.
    warnings.filterwarnings("ignore")

    data = pd.read_csv(f"persistence_images_data/data_n=100_resolution={resolution}.csv",
                       index_col="Unnamed: 0")
    X = data.loc[:, data.columns != 'result']
    y = data["result"]

    clf = RandomForestClassifier(bootstrap=True, criterion="gini", max_depth=16,
                                 n_estimators=4000)
    y_pred = cross_val_predict(clf, X, y, cv=5)

    # Sort the labels so the matrix axes come out in the same order on every
    # run; iterating a bare set gives no such guarantee.
    labels = sorted(set(y))
    index_of = {label: i for i, label in enumerate(labels)}
    predictions = np.zeros((len(labels), len(labels)))
    for actual, predicted in zip(y, y_pred):
        predictions[index_of[actual], index_of[predicted]] += 1

    # Correct predictions sit on the diagonal.
    acc = np.trace(predictions) / np.sum(predictions)

    print(f"resolution: {resolution}, acc: {acc}")

    plt.rcParams["figure.figsize"] = (7, 7)
    plt.matshow(predictions)
    ticks = list(range(len(labels)))
    plt.xticks(ticks, labels, rotation=45)
    plt.yticks(ticks, labels, rotation=0)
    # Name the file after the argument, not after a module-level loop variable.
    plt.savefig(f"images/acc_r={resolution}.pdf")
    plt.show()

    return acc


def predicting_cyclists(resolution=4):
    """Fit a random forest on the 4D training images and score each cycling team.

    The training table comes from
    ``persistence_images_data/4D_data_n=100_resolution={resolution}.csv``
    (rows containing missing values are dropped; ``result`` is the label).
    For every distinct ``team`` in ``cyclists.csv`` the four skill columns are
    passed to ``get_persistence_from_df``, the result is written to
    ``cyclists_persistences/{team}.csv``, that file is turned into a
    persistence image with ``create_persistence_image``, and the classifier's
    class probabilities for the flattened image are printed next to the team
    name. The class order is printed once before the per-team lines.

    Args:
        resolution: Used in the training CSV name and forwarded to
            ``create_persistence_image``.
    """
    training_path = f"persistence_images_data/4D_data_n=100_resolution={resolution}.csv"
    training = pd.read_csv(training_path, index_col="Unnamed: 0").dropna()
    labels = training["result"]
    features = training.drop(columns="result")

    riders = pd.read_csv("cyclists.csv", index_col="Unnamed: 0")
    team_names = set(riders["team"])

    forest = RandomForestClassifier(bootstrap=True, criterion="gini", max_depth=20,
                                    n_estimators=1000)
    forest.fit(features, labels)

    # Printed first so the probability vectors below can be read column by column.
    print(f"{forest.classes_}")

    skill_columns = ["MOUNTAIN", "TIMETRAIL", "SPRINT", "COBBLE"]
    for team in team_names:
        persistence_path = f"cyclists_persistences/{team}.csv"

        # Rows of this team, restricted to the four skill columns.
        team_skills = riders.loc[riders["team"] == team, skill_columns]
        get_persistence_from_df(team_skills, max_dimensions=4).to_csv(persistence_path)

        image = create_persistence_image(persistence_path, plot=False,
                                         resolution=resolution)

        # The classifier expects a 2-D input, so the image becomes a single row.
        probabilities = forest.predict_proba(image.reshape(1, -1))[0]
        print(team, probabilities)


if __name__ == "__main__":
    # predicting_cyclists(resolution=4)

    # Cross-validate one model per persistence-image resolution and keep every
    # accuracy, so the sweep can be summarised once it has finished.
    resolutions = range(2, 10)
    all_scores = []
    for r in resolutions:
        acc = train_model(r)
        all_scores.append(acc)

    # Pairs compare on accuracy first; ties go to the larger resolution.
    best_acc, best_r = max(zip(all_scores, resolutions))
    print(f"best resolution: {best_r}, acc: {best_acc}")