forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathh2o_summ.py
274 lines (233 loc) · 11 KB
/
h2o_summ.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import h2o_print as h2p, h2o_util
import math, functools, getpass
#***************************************************************************
# similar to Wai Yip Tung's. a pure python percentile function
# so we don't have to use the one(s) from numpy or scipy
# and require those package installs
## {{{ http://code.activestate.com/recipes/511478/ (r1)
def percentileOnSortedList(N, percent, key=lambda x:x, interpolate='mean'):
# 5 ways of resolving fractional
# floor, ceil, funky, linear, mean
interpolateChoices = ['floor', 'ceil', 'funky', 'linear', 'mean']
if interpolate not in interpolateChoices:
print "Bad choice for interpolate:", interpolate
print "Supported choices:", interpolateChoices
"""
Find the percentile of a list of values.
@parameter N - is a list of values. Note N MUST BE already sorted.
@parameter percent - a float value from 0.0 to 1.0.
@parameter key - optional key function to compute value from each element of N.
@return - the percentile of the values
"""
if N is None:
return None
k = (len(N)-1) * percent
f = int(math.floor(k))
c = int(math.ceil(k))
if f == c:
d = key(N[f])
msg = "aligned:"
elif interpolate=='floor':
d = key(N[f])
msg = "fractional with floor:"
elif interpolate=='ceil':
d = key(N[c])
msg = "fractional with ceil:"
elif interpolate=='funky':
d0 = key(N[f]) * (c-k)
d1 = key(N[c]) * (k-f)
d = d0+d1
msg = "fractional with Tung(floor and ceil) :"
elif interpolate=='linear':
assert (c-f)==1
assert (k>=f) and (k<=c)
pctDiff = k-f
dDiff = pctDiff * (key(N[c]) - key(N[f]))
d = key(N[f] + dDiff)
msg = "fractional %s with linear(floor and ceil):" % pctDiff
elif interpolate=='mean':
d = (key(N[c]) + key(N[f])) / 2.0
msg = "fractional with mean(floor and ceil):"
# print 3 around the floored k, for eyeballing when we're close
flooredK = int(f)
# print the 3 around the median
if flooredK > 0:
print "prior->", key(N[flooredK-1]), " "
else:
print "prior->", "<bof>"
print "floor->", key(N[flooredK]), " ", msg, 'result:', d, "f:", f, "len(N):", len(N)
if flooredK+1 < len(N):
print " ceil->", key(N[flooredK+1]), "c:", c
else:
print " ceil-> <eof>", "c:", c
return d
#***************************************************************************
# median is 50th percentile.
def medianOnSortedList(N, key=lambda x:x):
median = percentileOnSortedlist(N, percent=0.5, key=key)
return median
#***************************************************************************
def percentileOnSortedList_25_50_75( N, key=lambda x:x):
three = (
percentileOnSortedlist(N, percent=0.25, key=key),
percentileOnSortedlist(N, percent=0.50, key=key),
percentileOnSortedlist(N, percent=0.75, key=key),
)
return three
#***************************************************************************
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float',
h2oSummary2=None,
h2oSummary2MaxErr=None,
h2oQuantilesApprox=None, h2oQuantilesExact=None,
h2oExecQuantiles=None,
interpolate='linear', quantile=0.50, use_genfromtxt=False):
SCIPY_INSTALLED = True
try:
import scipy as sp
import numpy as np
print "Both numpy and scipy are installed. Will do extra checks"
except ImportError:
print "numpy or scipy is not installed. Will only do sort-based checking"
SCIPY_INSTALLED = False
if use_genfromtxt and SCIPY_INSTALLED:
print "Using numpy.genfromtxt. Better handling of null bytes"
target = np.genfromtxt(
open(csvPathname, 'r'),
delimiter=',',
skip_header=1 if skipHeader else 0,
dtype=None) # guess!
# print "shape:", target.shape()
else:
print "Using python csv reader"
target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype,
skipHeader=skipHeader, preview=5)
if datatype=='float':
# to make irene's R runif files first col work (quoted row numbers, integers
#shouldn't hurt anyone else?
# strip " from left (ignore leading whitespace
# strip " from right (ignore leading whitespace
targetFP = map(float, target)
# targetFP= np.array(tFP, np.float)
if datatype=='int':
targetFP = map(int, target)
if SCIPY_INSTALLED:
# http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
# numpy.percentile has simple linear interpolate and midpoint
# need numpy 1.9 for interpolation. numpy 1.8 doesn't have
# p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint')
# 1.8
p = np.percentile(targetFP, quantile*100)
h2p.red_print("numpy.percentile", p)
# per = [100 * t for t in thresholds]
from scipy import stats
s1 = stats.scoreatpercentile(targetFP, quantile*100)
h2p.red_print("scipy stats.scoreatpercentile", s1)
# scipy apparently doesn't have the use of means (type 2)
# http://en.wikipedia.org/wiki/Quantile
# it has median (R-8) with 1/3, 1/3
if 1==0:
# type 6
alphap=0
betap=0
# type 5 okay but not perfect
alphap=0.5
betap=0.5
# type 8
alphap=1/3.0
betap=1/3.0
if interpolate=='mean':
# an approx? (was good when comparing to h2o type 2)
alphap=0.4
betap=0.4
if interpolate=='linear':
# this is type 7
alphap=1
betap=1
s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
s2 = s2List[0]
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
# type 7
# alphap=0.4, betap=0.4,
# type 2 not available? (mean)
# alphap=1/3.0, betap=1/3.0 is approx median?
h2p.red_print("scipy stats.mstats.mquantiles:", s2)
# also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
# inplace sort
targetFP.sort()
# this matches scipy type 7 (linear)
# b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
# this matches h2o type 2 (mean)
# b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean')
b = percentileOnSortedList(targetFP, quantile, interpolate='linear')
label = str(quantile * 100) + '%'
h2p.blue_print(label, "from sort:", b)
if SCIPY_INSTALLED:
h2p.blue_print(label, "from numpy:", p)
h2p.blue_print(label, "from scipy 1:", s1)
h2p.blue_print(label, "from scipy 2:", s2)
h2p.blue_print(label, "from h2o summary:", h2oSummary2)
h2p.blue_print(label, "from h2o multipass:", h2oQuantilesExact)
h2p.blue_print(label, "from h2o singlepass:", h2oQuantilesApprox)
if h2oExecQuantiles:
h2p.blue_print(label, "from h2o exec:", h2oExecQuantiles)
# they should be identical. keep a tight absolute tolerance
# Note the comparisons have different tolerances, some are relative, some are absolute
if h2oQuantilesExact:
if math.isnan(float(h2oQuantilesExact)):
raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact)
h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002,
msg='h2o quantile multipass is not approx. same as sort algo')
if h2oQuantilesApprox:
# this can be NaN if we didn't calculate it. turn the NaN string into a float NaN
if math.isnan(float(h2oQuantilesApprox)):
raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox)
if h2oSummary2MaxErr:
h2o_util.assertApproxEqual(h2oQuantilesApprox, b, tol=h2oSummary2MaxErr,
msg='h2o quantile singlepass is not approx. same as sort algo')
else:
h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.1,
msg='h2o quantile singlepass is not approx. same as sort algo')
if h2oSummary2:
if math.isnan(float(h2oSummary2)):
raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2)
if h2oSummary2MaxErr:
# maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2))
h2o_util.assertApproxEqual(h2oSummary2, b, tol=h2oSummary2MaxErr,
msg='h2o summary2 is not approx. same as sort algo (calculated expected max error)')
else:
# bounds are way off, since it depends on the min/max of the col, not the expected value
h2o_util.assertApproxEqual(h2oSummary2, b, rel=1.0,
msg='h2o summary2 is not approx. same as sort algo (sloppy compare)')
if h2oQuantilesApprox and h2oSummary2:
# they should both get the same answer. Currently they have different code, but same algo
# FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases.
# not sure why..maybe some subtle algo diff.
h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04,
msg='h2o summary2 is not approx. same as h2o singlepass.'+\
' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation')
if h2oExecQuantiles:
if math.isnan(float(h2oExecQuantiles)):
raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles)
# bounds are way off
h2o_util.assertApproxEqual(h2oExecQuantiles, b, rel=1.0,
msg='h2o summary2 is not approx. same as sort algo')
if SCIPY_INSTALLED:
if h2oQuantilesExact:
h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002,
msg='h2o quantile multipass is not same as numpy.percentile')
h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002,
msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile')
# give us some slack compared to the scipy use of median (instead of desired mean)
# since we don't have bounds here like above, just stop this test for now
if h2oQuantilesApprox and 1==0:
if interpolate=='mean':
h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
msg='h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles')
else:
h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
msg='h2o quantile singlepass is not same as scipy stats.mstats.mquantiles')
# see if scipy changes. nope. it doesn't
if 1==0:
a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
h2p.red_print("after sort")
h2p.red_print("scipy stats.mstats.mquantiles:", s3)