Semeval2020-Task1/ensembling_script.py at master · umilISLab/Semeval2020-Task1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import argparse
import sys


LANGUAGES = ['english', 'latin', 'swedish', 'german']


def parse_args(argv=None):
    """Parse the command-line options of the ensembling script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): const='all' is not one of the choices; a bare
    # ``--language`` is therefore never a usable value (it is either rejected
    # by argparse or by the explicit language check in ``main``).
    parser.add_argument("--language", default='english', const='all', nargs='?',
                        help="Choose a language", choices=LANGUAGES)
    parser.add_argument("--method_1", default='aff_prop', type=str,
                        help="Name of the first clustering method, should be the same as column name in csv with results generated by calculate_semantic_change.py script")
    parser.add_argument("--input_file_method_1", default='semeval_results/results_english.csv', type=str,
                        help="Path to results file generated by calculate_semantic_change.py script")
    parser.add_argument("--method_2", default='w2v_dist', type=str,
                        help="Name of the static embedding method")
    # The two defaults below used to end with a stray '"' inside the literal,
    # which made the default paths point to oddly named files.
    parser.add_argument("--input_file_method_2", default='semeval_results/w2v_results_english_OP.tsv', type=str,
                        help="Path to results file generated by calculate_semantic_change.py script")
    parser.add_argument("--output_file_path", default='ensembling_aff_prop_w2v_dist_english.csv', type=str,
                        help="Output file containg ensembling results")
    return parser.parse_args(argv)


def build_ensemble(w2v_df, clustering_df, method_name1, method_name2, strip_suffix=False):
    """Merge the scores of two methods and add rank and ensemble columns.

    Args:
        w2v_df: DataFrame with a ``word`` column and a ``method_name2`` column.
        clustering_df: DataFrame with a ``word`` column and a ``method_name1``
            column.
        method_name1: Name of the score column taken from ``clustering_df``.
        method_name2: Name of the score column taken from ``w2v_df``.
        strip_suffix: When true, everything from the first ``_`` onwards is
            removed from the words of ``clustering_df`` before merging
            (presumably a tag appended to the target word - confirm against
            the input data).

    Returns:
        A new DataFrame with the columns ``word``, ``method_name1``,
        ``method_name2``, ``method1_rank``, ``method2_rank`` and
        ``ensemble_mean``. Only words present in both inputs are kept.
        The input frames are not modified.
    """
    if strip_suffix:
        # assign() returns a copy, so the caller's frame is left untouched
        clustering_df = clustering_df.assign(
            word=clustering_df['word'].apply(lambda x: x.split("_")[0]))

    # inner join: keep only the words scored by both methods
    df_merged = pd.merge(w2v_df, clustering_df, on='word')

    # let's see how words are ranked by the two methods
    df_merged = df_merged[['word', method_name1, method_name2]]
    df_merged = df_merged.assign(method1_rank=df_merged[method_name1].rank())
    df_merged = df_merged.assign(method2_rank=df_merged[method_name2].rank())

    # shift the scores of each method so that their mean is at 0
    method1_arr = np.array(df_merged[method_name1])
    method1_norm = method1_arr - np.mean(method1_arr)
    method2_arr = np.array(df_merged[method_name2])
    method2_norm = method2_arr - np.mean(method2_arr)

    # the ensemble score is the element-wise mean of the two centred scores
    ensemble_mean = np.mean([method1_norm, method2_norm], axis=0)

    return df_merged.assign(ensemble_mean=ensemble_mean)


def report_correlations(df_merged, method_name1, method_name2, lang):
    """Print the Spearman correlations between the methods and the ensemble.

    Args:
        df_merged: DataFrame as returned by ``build_ensemble``.
        method_name1: Name of the first score column.
        method_name2: Name of the second score column.
        lang: Language name, printed upper-cased in the header.
    """
    scores1 = df_merged[method_name1].astype('float64')
    scores2 = df_merged[method_name2].astype('float64')
    ensemble = df_merged['ensemble_mean'].astype('float64')

    print("\n===== Results for", lang.upper(),"=====")
    print("Correlation between", method_name1, "and", method_name2)
    # Spearman correlation between the two methods
    print(method_name1, "and", method_name2)
    spearman_corr = spearmanr(scores1, scores2)
    print("Spearman correlation:", spearman_corr[0])

    # Spearman correlation between method1 and the ensembled method
    print("Ensemble vs", method_name1)
    spearman_corr = spearmanr(scores1, ensemble)
    print("Spearman correlation:", spearman_corr[0])

    # Spearman correlation between method2 and the ensembled method
    print("Ensemble vs", method_name2)
    spearman_corr = spearmanr(scores2, ensemble)
    print("Spearman correlation:", spearman_corr[0])


def main(argv=None):
    """Run the ensembling: read both result files, write the merged table
    and print the correlations.

    Args:
        argv: Optional list of argument strings; ``None`` uses ``sys.argv``.
    """
    args = parse_args(argv)

    lang = args.language
    if lang not in LANGUAGES:
        print("Language not valid, valid choices are: ", ", ".join(LANGUAGES))
        # non-zero status so that callers can detect the failure
        sys.exit(1)

    method_name1 = args.method_1
    method_name2 = args.method_2

    # open the static-embedding distance file; the score column is named
    # after --method_2 so that it can be selected by that name later on
    # (it used to be hard-coded to 'w2v_dist', which broke any other name)
    w2v_df = pd.read_csv(args.input_file_method_2, sep="\t", names=['word', method_name2])

    # open the clustering results file
    clustering_df = pd.read_csv(args.input_file_method_1, sep="\t")

    # the word suffix is only stripped for English
    df_merged = build_ensemble(w2v_df, clustering_df, method_name1, method_name2,
                               strip_suffix=(lang == 'english'))

    df_merged.to_csv(args.output_file_path, sep='\t', encoding='utf-8', index=False)

    report_correlations(df_merged, method_name1, method_name2, lang)


if __name__ == '__main__':
    main()