libsvm -> kmeans appears to change the last column of the dataset from int to enum, and to alter its data (possibly to the predicted cluster ids?)

Kevin Normoyle · Kevin Normoyle · commit 48a5e0e06dfc · 2014-12-16T13:16:07.000-08:00
Added column type checks via inspect and summary, run before and after kmeans, to isolate when the dataset changes.
diff --git a/py/h2o_cmd.py b/py/h2o_cmd.py
@@ -350,12 +350,15 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
     if 1==0 and numCols and (len(summaries)!=numCols):
         raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))
 
+    coltypeList = []
     for column in summaries:
         colname = column['colname']
+        # is this always None? unused?
         coltype = column['type']
         nacnt = column['nacnt']
         stats = column['stats']
         stattype = stats['type']
+        coltypeList.append(stattype)
         h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))
 
         if stattype == 'Enum':
@@ -419,6 +422,8 @@ def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
             print "hbrk:", hbrk
             print "hcnt:", hcnt
 
+    return coltypeList
+
 def dot():
     sys.stdout.write('.')
     sys.stdout.flush()
diff --git a/py/testdir_multi_jvm/test_KMeans_libsvm_fvec.py b/py/testdir_multi_jvm/test_KMeans_libsvm_fvec.py
@@ -2,7 +2,6 @@
 import random, sys, time, os
 sys.path.extend(['.','..','../..','py'])
 import h2o, h2o_cmd, h2o_browse as h2b, h2o_import as h2i, h2o_kmeans
-
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -16,10 +15,41 @@ def setUpClass(cls):
     @classmethod
     def tearDownClass(cls):
         # wait while I inspect things
-        # time.sleep(1500)
+        # h2o.sleep(1500)
         h2o.tear_down_cloud()
 
     def test_KMeans_libsvm_fvec(self):
+
+        # hack this into a function so we can call it before and after kmeans
+        # kmeans is changing the last col to enum?? (and changing the data)
+        def do_summary_and_inspect():
+            # SUMMARY******************************************
+            summaryResult = h2o_cmd.runSummary(key=hex_key)
+            coltypeList = h2o_cmd.infoFromSummary(summaryResult)
+
+            # INSPECT******************************************
+            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
+            h2o_cmd.infoFromInspect(inspect, csvFilename)
+
+            numRows = inspect['numRows']
+            numCols = inspect['numCols']
+
+            # Now check both inspect and summary
+            if csvFilename=='covtype.binary.svm':
+                for k in range(55):
+                    naCnt = inspect['cols'][k]['naCnt']
+                    self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
+                    stype = inspect['cols'][k]['type']
+                    print k, stype
+                    self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))
+
+                # summary may report type differently than inspect..check it too!
+                # we could check na here too
+                for i,c in enumerate(coltypeList):
+                    print "column index: %s  column type: %s" % (i, c)
+                    # inspect says 'int?"
+                    assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
+
         # just do the import folder once
         # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
         # so probably 10x that for covtype200
@@ -42,8 +72,12 @@ def test_KMeans_libsvm_fvec(self):
             ("syn_0_100_1000.svm", "cL", 30, 1),
         ]
 
+        csvFilenameList = [
+            ("covtype.binary.svm", "cC", 30, 1),
+        ]
+
         ### csvFilenameList = random.sample(csvFilenameAll,1)
-        # h2b.browseTheCloud()
+        h2b.browseTheCloud()
         lenNodes = len(h2o.nodes)
 
         firstDone = False
@@ -55,16 +89,9 @@ def test_KMeans_libsvm_fvec(self):
             # PARSE******************************************
             # creates csvFilename.hex from file in importFolder dir 
             parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
-                hex_key=hex_key, timeoutSecs=2000)
-            print "Parse result['destination_key']:", parseResult['destination_key']
+                hex_key=hex_key, timeoutSecs=2000, doSummary=False)
 
-            # INSPECT******************************************
-            start = time.time()
-            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
-            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
-            h2o_cmd.infoFromInspect(inspect, csvFilename)
-            numRows = inspect['numRows']
-            numCols = inspect['numCols']
+            do_summary_and_inspect()
 
             # KMEANS******************************************
             for trial in range(1):
@@ -87,12 +114,16 @@ def test_KMeans_libsvm_fvec(self):
                 elapsed = time.time() - start
                 print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                     "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
+
+                do_summary_and_inspect()
+
                 # this does an inspect of the model and prints the clusters
                 h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
 
+                print "hello"
                 (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
 
-
+                do_summary_and_inspect()
 
 if __name__ == '__main__':
     h2o.unit_main()