-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcolumnwise_tests.py
More file actions
205 lines (169 loc) · 7.03 KB
/
columnwise_tests.py
File metadata and controls
205 lines (169 loc) · 7.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This module contains a function to calculate parwise-test statistics
for multiclass data-tables.
Can be run as command-line script.
"""
import numpy as np
import pandas as pd
from scipy import stats
def pairwise_test(first_data, second_data, test, nan_action='skip',
paired=False, *args, **kwargs):
""" Test columns of first_data against columns of second_data.
Additional arguments are passed to test-function used.
Signature of `test`is assumed to be::
test(x: array_like, y: array_like, *args, **kwargs) -> float, float
Parameters
----------
first_data, second_data : array_like
Datasets with n columns, columns must match.
test : callable
Which test to use.
nan_action : str
'skip' | 'omit_nans'
*args
Additional arguments passed to `test`
**kwargs
Addition keyword arguments passed to `test`
Returns
-------
statistics : array_like
One-dimensional array with test-statistics (length n)
pvalues : array_like
One-dimensional array with p-values (lenght n)
"""
if not nan_action in ('skip', 'omit_nan'):
raise ValueError('unknown nan_action: {0}'.format(nan_action))
if nan_action == 'omit_nan' and len(first_data) != len(second_data) and paired:
raise ValueError('cannot omit NaNs with different number of rows')
if type(first_data) == type(second_data) == type(pd.DataFrame()):
if not (set(first_data.columns) == set(second_data.columns)):
raise ValueError("Datasets don't match")
else:
# If not dataframes, assume that columns match if number of columns
# match.
if not (first_data.shape[1] == second_data.shape[1]):
raise ValueError("Datasets don't match")
first_data = pd.DataFrame(first_data)
second_data = pd.DataFrame(second_data)
statistics = np.zeros((first_data.shape[1], ))
pvalues = np.zeros((first_data.shape[1], ))
for i, column in enumerate(first_data.columns):
raw_x = first_data[column].values
raw_y = second_data[column].values
if nan_action == 'omit_nan':
if paired:
# Omit positions which are NaN in any vector.
missing = np.logical_or(np.isnan(raw_x), np.isnan(raw_y))
x = raw_x[~missing]
y = raw_y[~missing]
else:
x = raw_x[~np.isnan(raw_x)]
y = raw_y[~np.isnan(raw_y)]
elif np.logical_or(np.isnan(raw_x).any(), np.isnan(raw_y).any()):
# If skipping NaN:s, set statistic and p-value to NaN if any
# vector contains any NaN.
statistics[i] = pvalues[i] = np.nan
continue
else:
x = raw_x
y = raw_y
statistic, p = test(x, y, *args, **kwargs)
statistics[i] = statistic
pvalues[i] = p
return statistics, pvalues
def paired_students_t(x, y):
""" Perform a paired student's T test on `x` and `y`.
Parameters
----------
x, y : array_like
One-dimensional vectors containings paired values.
Returns
-------
statistic : float
Paired student's t
p : float
P-value
"""
if x.shape != y.shape:
raise ValueError('x and y do not match')
t_array, p = stats.ttest_rel(x, y)
# The t is returned in zero-dimensional array for some reason.
t = t_array.ravel()[0]
return t, p
if __name__ == '__main__':
import argparse
allowed_tests = ('student-t', 'mann-whitney-u', 'welch-t',
'paired-students-t')
parser = argparse.ArgumentParser(description=(
'This program performs column-wise statistical tests against the '
'null hypothesis that the samples are drawn from the same '
'population.'
))
def check_csv(fname):
if not fname.endswith('.csv'):
raise argparse.ArgumentTypeError('datasets must be csv-file')
else:
return fname
parser.add_argument('datasets', metavar='Data', nargs=2, type=check_csv,
help='csv-files to perform column-wise testing at')
test_help = (
'which test to use, available: {0}. '
'Default: Student-T'
).format(', '.join(test.title() for test in allowed_tests))
parser.add_argument('--test', help=test_help,
choices=allowed_tests, default='student-t')
o_help = 'output file. Defaults to: <data1>_<data2>_<test>.csv'
parser.add_argument('-o', '--output', type=str,
help=o_help)
group = parser.add_mutually_exclusive_group()
omit_help = (
'if set, calculate statistic using non-NaN-values '
'for those columns containing any missing values. '
'Only possible if datasets contain same number of rows.'
)
group.add_argument('--omit_nans', action='store_true',
default=False, help=omit_help)
skip_help = (
'if set, skip calculating statistic in those cases '
'where a column contains missing values.'
)
group.add_argument('--skip', action='store_true',
default=False, help=skip_help)
parser.set_defaults(nan_action='skip', test='student-t')
args = parser.parse_args()
if args.test == 'student-t':
test = stats.ttest_ind
elif args.test == 'mann-whitney-u':
test = stats.mannwhitneyu
elif args.test == 'welch-t':
test = lambda x, y: stats.ttest_ind(x, y, equal_var=False)
elif args.test == 'paired-students-t':
test = paired_students_t
first_data = pd.DataFrame.from_csv(args.datasets[0])
second_data = pd.DataFrame.from_csv(args.datasets[1])
if args.omit_nans and len(first_data) == len(second_data):
nan_action = 'omit_nan'
else:
if args.omit_nans:
msg = ('Warning: Different number of rows. Unable omit NaN:s. '
'Skips columns containing any missing values instead')
print(msg)
nan_action = 'skip'
nan_action = 'omit_nan' if \
(args.omit_nans and len(first_data) == len(second_data)) else 'skip'
paired = args.test in ('paired-students-t', )
statistics, pvalues = pairwise_test(first_data, second_data,
test, nan_action, paired=paired)
results = pd.DataFrame(np.column_stack((statistics, pvalues)),
index=first_data.columns, columns=['statistic', 'p'])
if args.output:
out = args.output
else:
out = '{0}_{1}_{2}.csv'.format(
args.datasets[0].replace('.csv', ''),
args.datasets[1].replace('.csv', ''),
args.test.replace('-', '_')
)
results.to_csv(out if out.endswith('.csv') else out + '.csv')