-
Notifications
You must be signed in to change notification settings - Fork 18
/
patch_evaluation_utils.py
228 lines (203 loc) · 9.29 KB
/
patch_evaluation_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import numpy as np
import scipy
import scipy.special as special
from scipy.stats._stats import (_kendall_dis, _toint64, _weightedrankedtau,
_local_correlations)
from scipy.stats import *
def _contains_nan(a, nan_policy='propagate'):
policies = ['propagate', 'raise', 'omit']
if nan_policy not in policies:
raise ValueError("nan_policy must be one of {%s}" %
', '.join("'%s'" % s for s in policies))
try:
# Calling np.sum to avoid creating a huge array into memory
# e.g. np.isnan(a).any()
with np.errstate(invalid='ignore'):
contains_nan = np.isnan(np.sum(a))
except TypeError:
# This can happen when attempting to sum things which are not
# numbers (e.g. as in the function `mode`). Try an alternative method:
try:
contains_nan = np.nan in set(a.ravel())
except TypeError:
# Don't know what to do. Fall back to omitting nan values and
# issue a warning.
contains_nan = False
nan_policy = 'omit'
warnings.warn("The input array could not be properly "
"checked for nan values. nan values "
"will be ignored.", RuntimeWarning)
if contains_nan and nan_policy == 'raise':
raise ValueError("The input contains nan values")
return contains_nan, nan_policy
"""
Modified from scipy.stats
"""
def kendalltau_bpq(x, y, initial_lexsort=None, nan_policy='propagate',
method='auto', variant='d'):
"""Calculates Kendall's tau, a correlation measure for ordinal data.
Kendall's tau is a measure of the correspondence between two rankings.
Values close to 1 indicate strong agreement, and values close to -1
indicate strong disagreement. This implements two variants of Kendall's
tau: tau-b (the default) and tau-c (also known as Stuart's tau-c). These
differ only in how they are normalized to lie within the range -1 to 1;
the hypothesis tests (their p-values) are identical. Kendall's original
tau-a is not implemented separately because both tau-b and tau-c reduce
to tau-a in the absence of ties.
Parameters
----------
x, y : array_like
Arrays of rankings, of the same shape. If arrays are not 1-D, they
will be flattened to 1-D.
initial_lexsort : bool, optional
Unused (deprecated).
nan_policy : {'propagate', 'raise', 'omit'}, optional
Defines how to handle when input contains nan.
The following options are available (default is 'propagate'):
* 'propagate': returns nan
* 'raise': throws an error
* 'omit': performs the calculations ignoring nan values
method : {'auto', 'asymptotic', 'exact'}, optional
Defines which method is used to calculate the p-value [5]_.
The following options are available (default is 'auto'):
* 'auto': selects the appropriate method based on a trade-off
between speed and accuracy
* 'asymptotic': uses a normal approximation valid for large samples
* 'exact': computes the exact p-value, but can only be used if no ties
are present. As the sample size increases, the 'exact' computation
time may grow and the result may lose some precision.
variant: {'b', 'c'}, optional
Defines which variant of Kendall's tau is returned. Default is 'b'.
Returns
-------
correlation : float
The tau statistic.
pvalue : float
The two-sided p-value for a hypothesis test whose null hypothesis is
an absence of association, tau = 0.
See Also
--------
spearmanr : Calculates a Spearman rank-order correlation coefficient.
theilslopes : Computes the Theil-Sen estimator for a set of points (x, y).
weightedtau : Computes a weighted version of Kendall's tau.
Notes
-----
The definition of Kendall's tau that is used is [2]_::
tau_b = (P - Q) / sqrt((P + Q + T) * (P + Q + U))
tau_c = 2 (P - Q) / (n**2 * (m - 1) / m)
where P is the number of concordant pairs, Q the number of discordant
pairs, T the number of ties only in `x`, and U the number of ties only in
`y`. If a tie occurs for the same pair in both `x` and `y`, it is not
added to either T or U. n is the total number of samples, and m is the
number of unique values in either `x` or `y`, whichever is smaller.
References
----------
.. [1] Maurice G. Kendall, "A New Measure of Rank Correlation", Biometrika
Vol. 30, No. 1/2, pp. 81-93, 1938.
.. [2] Maurice G. Kendall, "The treatment of ties in ranking problems",
Biometrika Vol. 33, No. 3, pp. 239-251. 1945.
.. [3] Gottfried E. Noether, "Elements of Nonparametric Statistics", John
Wiley & Sons, 1967.
.. [4] Peter M. Fenwick, "A new data structure for cumulative frequency
tables", Software: Practice and Experience, Vol. 24, No. 3,
pp. 327-336, 1994.
.. [5] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
Charles Griffin & Co., 1970.
Examples
--------
>>> from scipy import stats
>>> x1 = [12, 2, 1, 12, 2]
>>> x2 = [1, 4, 7, 1, 0]
>>> tau, p_value = stats.kendalltau(x1, x2)
>>> tau
-0.47140452079103173
>>> p_value
0.2827454599327748
"""
x = np.asarray(x).ravel()
y = np.asarray(y).ravel()
if x.size != y.size:
raise ValueError("All inputs to `kendalltau` must be of the same "
f"size, found x-size {x.size} and y-size {y.size}")
elif not x.size or not y.size:
# Return NaN if arrays are empty
return KendalltauResult(np.nan, np.nan)
# check both x and y
cnx, npx = _contains_nan(x, nan_policy)
cny, npy = _contains_nan(y, nan_policy)
contains_nan = cnx or cny
if npx == 'omit' or npy == 'omit':
nan_policy = 'omit'
if contains_nan and nan_policy == 'propagate':
return KendalltauResult(np.nan, np.nan)
elif contains_nan and nan_policy == 'omit':
x = ma.masked_invalid(x)
y = ma.masked_invalid(y)
if variant == 'b':
return mstats_basic.kendalltau(x, y, method=method, use_ties=True)
else:
raise ValueError("Only variant 'b' is supported for masked arrays")
if initial_lexsort is not None: # deprecate to drop!
warnings.warn('"initial_lexsort" is gone!')
def count_rank_tie(ranks):
cnt = np.bincount(ranks).astype('int64', copy=False)
cnt = cnt[cnt > 1]
return ((cnt * (cnt - 1) // 2).sum(),
(cnt * (cnt - 1.) * (cnt - 2)).sum(),
(cnt * (cnt - 1.) * (2*cnt + 5)).sum())
size = x.size
perm = np.argsort(y) # sort on y and convert y to dense ranks
x, y = x[perm], y[perm]
y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype=np.intp)
# stable sort on x and convert x to dense ranks
perm = np.argsort(x, kind='mergesort')
x, y = x[perm], y[perm]
x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype=np.intp)
dis = _kendall_dis(x, y) # discordant pairs
obs = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
cnt = np.diff(np.nonzero(obs)[0]).astype('int64', copy=False)
ntie = (cnt * (cnt - 1) // 2).sum() # joint ties
xtie, x0, x1 = count_rank_tie(x) # ties in x, stats
ytie, y0, y1 = count_rank_tie(y) # ties in y, stats
tot = (size * (size - 1)) // 2
if xtie == tot or ytie == tot:
return KendalltauResult(np.nan, np.nan)
# Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
# = con + dis + xtie + ytie - ntie
con_minus_dis = tot - xtie - ytie + ntie - 2 * dis
con = tot - xtie - ytie + ntie
if variant == 'b':
tau = con_minus_dis / np.sqrt(tot - xtie) / np.sqrt(tot - ytie)
elif variant == 'c':
minclasses = min(len(set(x)), len(set(y)))
tau = 2*con_minus_dis / (size**2 * (minclasses-1)/minclasses)
elif variant == 'd':
tau = (con + ytie*0.5) / (con + dis + ytie)
else:
raise ValueError(f"Unknown variant of the method chosen: {variant}. "
"variant must be 'b' or 'c'.")
# Limit range to fix computational errors
tau = min(1., max(-1., tau))
# The p-value calculation is the same for all variants since the p-value
# depends only on con_minus_dis.
if method == 'exact' and (xtie != 0 or ytie != 0):
raise ValueError("Ties found, exact method cannot be used.")
if method == 'auto':
if (xtie == 0 and ytie == 0) and (size <= 33 or
min(dis, tot-dis) <= 1):
method = 'exact'
else:
method = 'asymptotic'
if xtie == 0 and ytie == 0 and method == 'exact':
pvalue = mstats_basic._kendall_p_exact(size, min(dis, tot-dis))
elif method == 'asymptotic':
# con_minus_dis is approx normally distributed with this variance [3]_
m = size * (size - 1.)
var = ((m * (2*size + 5) - x1 - y1) / 18 +
(2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))
pvalue = (special.erfc(np.abs(con_minus_dis) /
np.sqrt(var) / np.sqrt(2)))
else:
raise ValueError(f"Unknown method {method} specified. Use 'auto', "
"'exact' or 'asymptotic'.")
return tau