forked from smontariol/Semeval2020-Task1
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathensembling_script.py
More file actions
87 lines (67 loc) · 3.98 KB
/
ensembling_script.py
File metadata and controls
87 lines (67 loc) · 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import argparse
import sys
def build_arg_parser():
    """Build the command-line parser for the ensembling script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--language``,
        ``--method_1``, ``--input_file_method_1``, ``--method_2``,
        ``--input_file_method_2`` and ``--output_file_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", default='english', const='all', nargs='?',
                        help="Choose a language", choices=['english', 'latin', 'swedish', 'german'])
    parser.add_argument("--method_1", default='aff_prop', type=str,
                        help="Name of the first clustering method, should be the same as column name in csv with results generated by calculate_semantic_change.py script")
    parser.add_argument("--input_file_method_1", default='semeval_results/results_english.csv', type=str,
                        help="Path to results file generated by calculate_semantic_change.py script")
    parser.add_argument("--method_2", default='w2v_dist', type=str,
                        help="Name of the static embedding method")
    # The default paths below used to end with a stray '"' inside the string
    # literal, so the default input could not be found and the default output
    # file name ended with a quote character.
    parser.add_argument("--input_file_method_2", default='semeval_results/w2v_results_english_OP.tsv', type=str,
                        help="Path to results file generated by calculate_semantic_change.py script")
    parser.add_argument("--output_file_path", default='ensembling_aff_prop_w2v_dist_english.csv', type=str,
                        help="Output file containg ensembling results")
    return parser


def compute_ensemble(w2v_df, clustering_df, method_name1, method_name2):
    """Merge the two result tables on ``word`` and add rank/ensemble columns.

    Args:
        w2v_df: DataFrame with a ``word`` column and a ``method_name2`` column.
        clustering_df: DataFrame with a ``word`` column and a
            ``method_name1`` column.
        method_name1: name of the score column taken from ``clustering_df``.
        method_name2: name of the score column taken from ``w2v_df``.

    Returns:
        DataFrame with columns ``word``, ``method_name1``, ``method_name2``,
        ``method1_rank``, ``method2_rank`` and ``ensemble_mean``. Only words
        present in both inputs are kept (inner join).

    Raises:
        KeyError: if either score column is missing after the merge.
    """
    merged = pd.merge(w2v_df, clustering_df, on='word')

    missing = [col for col in (method_name1, method_name2) if col not in merged.columns]
    if missing:
        raise KeyError("Column(s) not found in merged results: {}".format(", ".join(missing)))

    merged = merged[['word', method_name1, method_name2]]

    # Rank the words under each method so the two orderings can be compared.
    merged = merged.assign(method1_rank=merged[method_name1].rank())
    merged = merged.assign(method2_rank=merged[method_name2].rank())

    # Centre each score at zero (mean subtraction only, no rescaling) before
    # averaging the two methods element-wise.
    method1_arr = np.array(merged[method_name1])
    method1_centered = method1_arr - np.mean(method1_arr)
    method2_arr = np.array(merged[method_name2])
    method2_centered = method2_arr - np.mean(method2_arr)
    ensemble_mean = np.mean([method1_centered, method2_centered], axis=0)

    return merged.assign(ensemble_mean=ensemble_mean)


def spearman_between(df, col_a, col_b):
    """Return the Spearman correlation coefficient between two columns of ``df``."""
    return spearmanr(df[col_a].astype('float64'), df[col_b].astype('float64'))[0]


def main(argv=None):
    """Run the ensembling pipeline from the command line.

    Reads both result files, writes the merged table with the ensemble score
    to ``--output_file_path`` (tab-separated) and prints Spearman
    correlations between the two methods and the ensemble.

    Args:
        argv: optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)

    lang = args.language
    languages = ['english', 'latin', 'swedish', 'german']
    if lang not in languages:
        print("Language not valid, valid choices are: ", ", ".join(languages))
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    method_name1 = args.method_1
    method_name2 = args.method_2
    clustering_file = args.input_file_method_1
    w2v_file = args.input_file_method_2
    csv_file = args.output_file_path

    # The word2vec distance file has no header row; name its score column
    # after --method_2 so a non-default method name does not fail later with
    # a missing-column error.
    w2v_df = pd.read_csv(w2v_file, sep="\t", names=['word', method_name2])

    # The clustering results file is tab-separated with a header row.
    clustering_df = pd.read_csv(clustering_file, sep="\t")

    # For English, keep only the part of each word before the first "_" so
    # the words match those in the word2vec file.
    # NOTE(review): presumably the suffix is a POS tag - confirm.
    if lang == 'english':
        clustering_df['word'] = clustering_df['word'].apply(lambda x: x.split("_")[0])

    df_merged = compute_ensemble(w2v_df, clustering_df, method_name1, method_name2)
    df_merged.to_csv(csv_file, sep='\t', encoding='utf-8', index=False)

    print("\n===== Results for", lang.upper(),"=====")
    print("Correlation between", method_name1, "and", method_name2)

    # Spearman correlation between the two methods
    print(method_name1, "and", method_name2)
    print("Spearman correlation:", spearman_between(df_merged, method_name1, method_name2))

    # Spearman correlation between method1 and the ensembled method
    print("Ensemble vs", method_name1)
    print("Spearman correlation:", spearman_between(df_merged, method_name1, 'ensemble_mean'))

    # Spearman correlation between method2 and the ensembled method
    print("Ensemble vs", method_name2)
    print("Spearman correlation:", spearman_between(df_merged, method_name2, 'ensemble_mean'))


if __name__ == '__main__':
    main()