Skip to content

Commit 48a5e0e

Browse files
author
Kevin Normoyle
committed
libsvm -> kmeans: KMeans seems to be changing the last column of the dataset from int to enum and changing its data (to the predicted cluster ids?)
Added additional type checking with inspect and summary to isolate when the dataset changes.
1 parent 4d5b1ef commit 48a5e0e

File tree

2 files changed

+49
-13
lines changed

2 files changed

+49
-13
lines changed

py/h2o_cmd.py

+5
Original file line numberDiff line numberDiff line change
@@ -350,12 +350,15 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
350350
if 1==0 and numCols and (len(summaries)!=numCols):
351351
raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))
352352

353+
coltypeList = []
353354
for column in summaries:
354355
colname = column['colname']
356+
# is this always None? unused?
355357
coltype = column['type']
356358
nacnt = column['nacnt']
357359
stats = column['stats']
358360
stattype = stats['type']
361+
coltypeList.append(stattype)
359362
h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))
360363

361364
if stattype == 'Enum':
@@ -419,6 +422,8 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
419422
print "hbrk:", hbrk
420423
print "hcnt:", hcnt
421424

425+
return coltypeList
426+
422427
def dot():
423428
sys.stdout.write('.')
424429
sys.stdout.flush()

py/testdir_multi_jvm/test_KMeans_libsvm_fvec.py

+44-13
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import random, sys, time, os
33
sys.path.extend(['.','..','../..','py'])
44
import h2o, h2o_cmd, h2o_browse as h2b, h2o_import as h2i, h2o_kmeans
5-
65
class Basic(unittest.TestCase):
76
def tearDown(self):
87
h2o.check_sandbox_for_errors()
@@ -16,10 +15,41 @@ def setUpClass(cls):
1615
@classmethod
1716
def tearDownClass(cls):
1817
# wait while I inspect things
19-
# time.sleep(1500)
18+
# h2o.sleep(1500)
2019
h2o.tear_down_cloud()
2120

2221
def test_KMeans_libsvm_fvec(self):
22+
23+
# hack this into a function so we can call it before and after kmeans
24+
# kmeans is changing the last col to enum?? (and changing the data)
25+
def do_summary_and_inspect():
26+
# SUMMARY******************************************
27+
summaryResult = h2o_cmd.runSummary(key=hex_key)
28+
coltypeList = h2o_cmd.infoFromSummary(summaryResult)
29+
30+
# INSPECT******************************************
31+
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
32+
h2o_cmd.infoFromInspect(inspect, csvFilename)
33+
34+
numRows = inspect['numRows']
35+
numCols = inspect['numCols']
36+
37+
# Now check both inspect and summary
38+
if csvFilename=='covtype.binary.svm':
39+
for k in range(55):
40+
naCnt = inspect['cols'][k]['naCnt']
41+
self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
42+
stype = inspect['cols'][k]['type']
43+
print k, stype
44+
self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))
45+
46+
# summary may report type differently than inspect..check it too!
47+
# we could check na here too
48+
for i,c in enumerate(coltypeList):
49+
print "column index: %s column type: %s" % (i, c)
50+
# inspect says 'Int'?
51+
assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
52+
2353
# just do the import folder once
2454
# make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
2555
# so probably 10x that for covtype200
@@ -42,8 +72,12 @@ def test_KMeans_libsvm_fvec(self):
4272
("syn_0_100_1000.svm", "cL", 30, 1),
4373
]
4474

75+
csvFilenameList = [
76+
("covtype.binary.svm", "cC", 30, 1),
77+
]
78+
4579
### csvFilenameList = random.sample(csvFilenameAll,1)
46-
# h2b.browseTheCloud()
80+
h2b.browseTheCloud()
4781
lenNodes = len(h2o.nodes)
4882

4983
firstDone = False
@@ -55,16 +89,9 @@ def test_KMeans_libsvm_fvec(self):
5589
# PARSE******************************************
5690
# creates csvFilename.hex from file in importFolder dir
5791
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
58-
hex_key=hex_key, timeoutSecs=2000)
59-
print "Parse result['destination_key']:", parseResult['destination_key']
92+
hex_key=hex_key, timeoutSecs=2000, doSummary=False)
6093

61-
# INSPECT******************************************
62-
start = time.time()
63-
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
64-
print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
65-
h2o_cmd.infoFromInspect(inspect, csvFilename)
66-
numRows = inspect['numRows']
67-
numCols = inspect['numCols']
94+
do_summary_and_inspect()
6895

6996
# KMEANS******************************************
7097
for trial in range(1):
@@ -87,12 +114,16 @@ def test_KMeans_libsvm_fvec(self):
87114
elapsed = time.time() - start
88115
print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
89116
"%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
117+
118+
do_summary_and_inspect()
119+
90120
# this does an inspect of the model and prints the clusters
91121
h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
92122

123+
print "hello"
93124
(centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
94125

95-
126+
do_summary_and_inspect()
96127

97128
if __name__ == '__main__':
98129
h2o.unit_main()

0 commit comments

Comments
 (0)