forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
h2o_kmeans.py
189 lines (160 loc) · 8.33 KB
/
h2o_kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import h2o_cmd
import h2o_nodes
import re, math, random
from h2o_test import check_sandbox_for_errors
def pickRandKMeansParams(paramDict, params):
    """Copy a random selection of parameter settings from paramDict into params.

    paramDict maps a parameter name to a sequence of candidate values.
    Between 1 and len(paramDict) draws are made; each draw picks a random
    key and then a random value from that key's candidates and stores it in
    ``params`` (mutated in place). The same key can be drawn more than once,
    so fewer distinct keys than draws may end up being set.

    Nothing is returned. An empty paramDict leaves ``params`` untouched.
    """
    if not paramDict:
        # random.randint(1, 0) would raise ValueError; there is nothing to pick.
        return

    # Materialize the keys once: random.choice needs an indexable sequence,
    # and dict.keys() is a non-indexable view on Python 3.
    candidateKeys = list(paramDict)
    randomGroupSize = random.randint(1, len(candidateKeys))
    for _ in range(randomGroupSize):
        randomKey = random.choice(candidateKeys)
        params[randomKey] = random.choice(paramDict[randomKey])
def simpleCheckKMeans(self, kmeans, **kwargs):
    """Run basic sanity checks on a KMeans json response.

    Args:
        self: unused here; kept so the helper can be called with a test case.
        kmeans: the json response dictionary. It must contain a 'model'
            dictionary and may contain a 'warnings' list of strings.
        **kwargs: accepted for call compatibility and ignored.

    Returns:
        The value of kmeans['warnings'], or None if that key is absent.

    Raises:
        Exception: if any warning contains "Failed"/"failed", or if any
            center coordinate is NaN.
        KeyError: if 'model' or one of the expected model fields is missing.
    """
    warnings = None
    if 'warnings' in kmeans:
        warnings = kmeans['warnings']
        # catch the 'Failed to converge" for now
        failedPattern = re.compile("[Ff]ailed")
        for w in warnings:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("\nwarning: %s" % (w,))
            if failedPattern.search(w):
                raise Exception(w)

    # Check other things in the json response dictionary 'kmeans' here.
    # can't use inspect on a model key? now? -- so only the response is examined.
    model = kmeans['model']

    # Touch every field the result checkers read, so a malformed response
    # fails right here with a KeyError naming the missing field.
    requiredFields = (
        '_key',
        'centers',
        'size',
        'within_cluster_variances',
        'total_within_SS',
        'iterations',
        'normalized',
        'max_iter',
    )
    for field in requiredFields:
        model[field]

    for i, center in enumerate(model['centers']):
        for value in center:
            if math.isnan(float(value)):
                raise Exception("center %s has NaN: %s center: %s" % (i, value, center))

    # shouldn't have any errors
    # NOTE(review): check_sandbox_for_errors comes from h2o_test; presumably it
    # raises if the sandbox logs contain errors -- confirm against that module.
    check_sandbox_for_errors()

    return warnings
def bigCheckResults(self, kmeans, csvPathname, parseResult, predictKey, **kwargs):
    """Check a KMeans response, score the parsed data, and report per-cluster results.

    Args:
        self: passed through to simpleCheckKMeans.
        kmeans: the KMeans json response dictionary (must contain 'model').
        csvPathname: unused; kept for call compatibility.
        parseResult: parse response; its 'destination_key' is the data key
            that predictions are generated for.
        predictKey: destination key for the generated predictions, also the
            key that is summarized afterwards.
        **kwargs: forwarded to simpleCheckKMeans.

    Returns:
        (centers, tupleResultList), where tupleResultList holds one
        (center, rows, sqr_error) tuple per center, in model order.

    Raises:
        Exception: if the centers, the summary histogram and the
            within-cluster variances do not all have the same length
            (plus whatever simpleCheckKMeans raises).
    """
    simpleCheckKMeans(self, kmeans, **kwargs)

    # can't use inspect on a model key? now? -- read everything from the response
    model = kmeans['model']
    model_key = model['_key']
    centers = model['centers']
    size = model["size"]
    # FIX! does the cluster order/naming match, compared to cluster variances
    sqr_error_per_cluster = model["within_cluster_variances"]
    error = model["total_within_SS"]
    iterations = model["iterations"]
    max_iter = model["max_iter"]

    # The prediction response itself is not used; presumably the call stores
    # its result under predictKey, which is then summarized -- TODO confirm.
    h2o_nodes.nodes[0].generate_predictions(
        data_key=parseResult['destination_key'], model_key=model_key, destination_key=predictKey)
    summaryResult = h2o_nodes.nodes[0].summary_page(key=predictKey, timeoutSecs=120)
    # histogram counts of the first summarized column, used as rows per cluster
    rows_per_cluster = summaryResult['summaries'][0]['hcnt']

    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("\nerror:  %s" % (error,))
    if (len(centers) != len(rows_per_cluster) or len(centers) != len(sqr_error_per_cluster)):
        raise Exception("centers, rows_per_cluster, sqr_error_per_cluster should all be same length %s, %s, %s" % \
            (len(centers), len(rows_per_cluster), len(sqr_error_per_cluster)))

    print("Did iterations: %s given max_iter: %s" % (iterations, max_iter))
    tupleResultList = []
    for i, center in enumerate(centers):
        print("\ncenters[%s]:  %s" % (i, [round(coord, 2) for coord in center]))
        print("rows_per_cluster[%s]:  %s" % (i, rows_per_cluster[i]))
        print("sqr_error_per_cluster[%s]:  %s" % (i, sqr_error_per_cluster[i]))
        print("size[%s]: %s" % (i, size[i]))
        tupleResultList.append((center, rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
# list of tuples: center, rows, sqr_error
# expected = [ # tupleResultList is returned by bigCheckResults like this
# ([-2.2824436059344264, -0.9572469619836067], 61, 71.04484889371177),
# ([0.04072444664179102, 1.738305108029851], 67, 118.83608173427331),
# ([2.7300104405999996, -1.16148755108], 50, 68.67496427685141)
# ]
# allowedDelta is a tuple of relative multipliers (center, rows, error). Each one is
# multiplied by the corresponding expected value to get the absolute delta allowed.
# allowedDelta = (0.01, 0.1, 0.01)
def compareResultsToExpected(self, tupleResultList, expected=None, allowedDelta=None, allowError=False, allowRowError=False, trial=0):
    """Print actual (and expected) cluster tuples and assert that they agree.

    Args:
        self: object providing assertAlmostEqual (a unittest.TestCase).
        tupleResultList: list of (center, rows, sqr_error) tuples.
            Sorted IN PLACE by center, so the caller sees the new order.
        expected: optional list of tuples of the same shape; also sorted
            in place. When None, results are only printed.
        allowedDelta: tuple of relative multipliers. Index 0 scales each
            expected center coordinate and index 1 scales the expected row
            count to get the absolute delta allowed. Required whenever a
            non-empty ``expected`` is compared.
        allowError: when True, skip all comparisons (print only).
        allowRowError: when True, skip the row-count comparison.
        trial: trial number used in the printed headers and failure messages.

    Raises:
        ValueError: if a comparison is requested but allowedDelta is None.
        AssertionError: (via self.assertAlmostEqual) when a center
            coordinate or a row count is outside the allowed delta.
    """
    from operator import itemgetter
    byCenter = itemgetter(0)

    # sort the tuple list by center for the comparison. (visible to the caller)
    tupleResultList.sort(key=byCenter)

    # Single-argument print() calls emit the same text on Python 2 and 3.
    if expected is not None:
        # sort expected, just in case, for the comparison
        expected.sort(key=byCenter)
        print("\nTrial #%d Expected:" % trial)
        for e in expected:
            print(e)

    print("\nTrial #%d Actual:" % trial)
    for t in tupleResultList:
        # trailing comma so the output can be cut and pasted into an expected = [..] list
        print("%s ," % (t,))

    # An empty expected list has nothing to compare, same as None.
    if not expected or allowError:
        return

    if allowedDelta is None:
        # Fail with a clear message instead of a TypeError on None[0] below.
        raise ValueError("compareResultsToExpected: allowedDelta is required when expected is given")

    # now compare to expected, with some delta allowed
    for i, (expCenter, expRows, _) in enumerate(expected):
        actCenter, actRows, _ = tupleResultList[i]

        for (a, b) in zip(expCenter, actCenter):  # compare list of floats
            absAllowedDelta = abs(allowedDelta[0] * a)
            self.assertAlmostEqual(a, b, delta=absAllowedDelta,
                msg="Trial %d Center value expected: %s actual: %s delta > %s" % (trial, a, b, absAllowedDelta))

        if not allowRowError:  # allow error in row count?
            absAllowedDelta = abs(allowedDelta[1] * expRows)
            self.assertAlmostEqual(expRows, actRows, delta=absAllowedDelta,
                msg="Trial %d Rows expected: %s actual: %s delta > %s" % (trial, expRows, actRows, absAllowedDelta))

        # fix, we don't compare the actual error # (what is it?)
# just print info on the distribution
def showClusterDistribution(self, tupleResultList, expected=None, allowedDelta=None, allowError=False, trial=0):
    """Print how rows and error are distributed over the clusters.

    Nothing is asserted; this is a reporting helper only.

    Args:
        self: unused; kept for call compatibility.
        tupleResultList: list of (center, rows, error) tuples, or None to
            skip the "Actual" section. Sorted IN PLACE by center.
        expected: optional list of tuples of the same shape for the
            "Expected" section. Also sorted in place.
        allowedDelta, allowError, trial: unused; kept for call compatibility.
    """
    from operator import itemgetter
    byCenter = itemgetter(0)

    if expected is not None:
        # sort expected, just in case, for the comparison
        expected.sort(key=byCenter)
        _printClusterDistribution("Expected", expected)

    if tupleResultList is not None:
        # sort by center. (this is visible to the caller)
        tupleResultList.sort(key=byCenter)
        _printClusterDistribution("Actual", tupleResultList)


def _printClusterDistribution(label, clusterTuples):
    """Print total rows/error and each cluster's fraction of those totals."""
    # get total row and total error
    totalRows = 0
    totalError = 0
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("\n%s distribution, rows and error:" % label)
    for _, rows, error in clusterTuples:
        totalRows += rows
        totalError += error

    # now go thru again and print fractions of the totals
    print("totalRows: %s totalError: %s" % (totalRows, totalError))
    for center, rows, error in clusterTuples:
        # A zero total (e.g. zero within-cluster error everywhere) has no
        # meaningful fraction; report 0.00 instead of dividing by zero.
        pctRows = rows / (totalRows + 0.0) if totalRows else 0.0
        pctError = error / (totalError + 0.0) if totalError else 0.0
        print("%s pctRows: %0.2f pctError: %0.2f" % (center, pctRows, pctError))
# Compare these clusters to the ones from the first run. Since the files are concatenations,
# the results should be similar. The allowed delta is 10% of the first run's value.
def compareToFirstKMeans(self, clusters, firstclusters):
    """Assert that the current clusters are within 10% of the first run's clusters.

    Args:
        self: object providing assertAlmostEqual and assertGreaterEqual
            (a unittest.TestCase).
        clusters: a list of clusters (each an iterable of numeric values),
            or a single cluster, which is wrapped in a one-element list.
        firstclusters: the first run's clusters, in the same shape.

    Raises:
        Exception: if clusters is a dict.
        AssertionError: (via the self.assert* methods) when a value differs
            from the first run's value by more than 10% of the latter.
    """
    # clusters could be a list or not. if it already is a list, use it as is
    # rather than creating a list of that list.
    if isinstance(clusters, dict):
        # The original message appended an undefined name, which raised
        # NameError instead of this Exception.
        raise Exception("compareToFirstKMeans: Not expecting dict for clusters")

    if isinstance(clusters, list):
        kList = clusters
        firstkList = firstclusters
    else:
        kList = [clusters]
        firstkList = [firstclusters]

    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("kList: %s firstkList: %s" % (kList, firstkList))
    for k, firstk in zip(kList, firstkList):
        # too bad we can't do an assertAlmostEqual on the list directly..have to break them out
        for k1, firstk1 in zip(k, firstk):
            current = float(k1)
            first = float(firstk1)
            # delta must be non-negative, hence the abs()
            delta = .1 * abs(first)
            print("k1: %s firstk1: %s" % (k1, firstk1))
            msg = "Too large a delta (>" + str(delta) + ") comparing current and first clusters: " + \
                str(current) + ", " + str(first)
            self.assertAlmostEqual(current, first, delta=delta, msg=msg)
            # abs() >= 0.0 only fails for NaN, so this acts as a NaN check
            self.assertGreaterEqual(abs(current), 0.0, str(k1) + " abs not >= 0.0 in current")