did anything change in treeStats? add more debug

AI-Cdrone · Oct 19, 2014 · 6517700 · 6517700
1 parent c3ba0b2
commit 6517700
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 62 deletions.
diff --git a/py/h2o_rf.py b/py/h2o_rf.py
@@ -149,6 +149,8 @@ def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False
 
     varimp = rf_model['varimp']
     treeStats = rf_model['treeStats']
+    if not treeStats:
+        raise Exception("treeStats not right?: %s" % h2o.dump_json(treeStats))
     # print "json:", h2o.dump_json(rfv)
     data_key = rf_model['_dataKey']
     model_key = rf_model['_key']

diff --git a/py/testdir_single_jvm/test_speedrf_covtype.py b/py/testdir_single_jvm/test_speedrf_covtype.py
@@ -22,11 +22,11 @@
     'seed': None,
     }
 
-DO_OOBE = False
-DO_PLOT = False
+DO_OOBE = True
+DO_PLOT = True
 # TRY = 'max_depth'
-# TRY = 'ntrees'
-TRY = 'nbins'
+TRY = 'ntrees'
+# TRY = 'nbins'
 
 
 class Basic(unittest.TestCase):
@@ -120,71 +120,71 @@ def test_speedrf_covtype_fvec(self):
 
                 start = time.time()
                 rfResult = h2o_cmd.runSpeeDRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs,
-                                         noPoll=True, **kwargs)
+                    noPoll=True, **kwargs)
                 trainElapsed = time.time() - start
                 print 'rf train end', i, 'on', csvTrainPathname, 'took', trainElapsed, 'seconds'
 
                 # don't cancel the last one
-                #if not lastOne:
-                #    time.sleep(1)
-                #    h2o_jobs.cancelAllJobs(timeoutSecs=2)
-                h2o_jobs.cancelAllJobs(timeoutSecs=2)
+                if not lastOne:
+                    time.sleep(1)
+                    h2o_jobs.cancelAllJobs(timeoutSecs=2)
+                # h2o_jobs.cancelAllJobs(timeoutSecs=2)
 
 
 
-            #### print "rfView", h2o.dump_json(rfView)
-            ##print "We have a result from the RF above, completed but didn't do RFView yet"
-            ## could the RF indicate 'done' too soon?
-            ## if rfResult['state']=='RUNNING':
-            ##    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
-
-            ## if 'drf_model' not in rfResult:
-            ##    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
-            #h2o_jobs.pollWaitJobs(timeoutSecs=300)
-            #rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
-            #print "rfView:", h2o.dump_json(rfView)
-
-            #rfView["drf_model"] = rfView.pop("speedrf_model")
-            #rf_model = rfView['drf_model']
-            #cms = rf_model['cms']
-            #### print "cm:", h2o.dump_json(cm)
-            #ntrees = rf_model['N']
-            #errs = rf_model['errs']
-            #N = rf_model['N']
-            #varimp = rf_model['varimp']
-            #treeStats = rf_model['treeStats']
-
-            #print "maxDepth:", treeStats['maxDepth']
-            #print "maxLeaves:", treeStats['maxLeaves']
-            #print "minDepth:", treeStats['minDepth']
-            #print "minLeaves:", treeStats['minLeaves']
-            #print "meanLeaves:", treeStats['meanLeaves']
-            #print "meanDepth:", treeStats['meanDepth']
-            #print "errs[0]:", errs[0]
-            #print "errs[-1]:", errs[-1]
-            #print "errs:", errs
-
-            #(classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
-            ## we iterate over params, so can't really do this check
-            ## self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
-
-            #print "classErrorPctList:", classErrorPctList
-            #self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
-            ## FIX! should update this expected classification error
-            #predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
-
-            #eList.append(classErrorPctList[4])
-            #fList.append(trainElapsed)
-            #if DO_PLOT:
-            #    if TRY == 'max_depth':
-            #        xLabel = 'max_depth'
-            #    elif TRY == 'ntrees':
-            #        xLabel = 'ntrees'
-            #    elif TRY == 'nbins':
-            #        xLabel = 'nbins'
-            #    else:
-            #        raise Exception("huh? %s" % TRY)
-            #    xList.append(paramDict[xLabel])
+            ### print "rfView", h2o.dump_json(rfView)
+            #print "We have a result from the RF above, completed but didn't do RFView yet"
+            # could the RF indicate 'done' too soon?
+            # if rfResult['state']=='RUNNING':
+            #    raise Exception("Why is this RF still in RUNNING state? %s" % h2o.dump_json(rfResult))
+
+            # if 'drf_model' not in rfResult:
+            #    raise Exception("How come there's no drf_model in this RF result? %s" % h2o.dump_json(rfResult))
+            h2o_jobs.pollWaitJobs(timeoutSecs=300)
+            rfView = h2o_cmd.runSpeeDRFView(None, model_key, timeoutSecs=60)
+            print "rfView:", h2o.dump_json(rfView)
+
+            rfView["drf_model"] = rfView.pop("speedrf_model")
+            rf_model = rfView['drf_model']
+            cms = rf_model['cms']
+            ### print "cm:", h2o.dump_json(cm)
+            ntrees = rf_model['N']
+            errs = rf_model['errs']
+            N = rf_model['N']
+            varimp = rf_model['varimp']
+            treeStats = rf_model['treeStats']
+
+            print "maxDepth:", treeStats['maxDepth']
+            print "maxLeaves:", treeStats['maxLeaves']
+            print "minDepth:", treeStats['minDepth']
+            print "minLeaves:", treeStats['minLeaves']
+            print "meanLeaves:", treeStats['meanLeaves']
+            print "meanDepth:", treeStats['meanDepth']
+            print "errs[0]:", errs[0]
+            print "errs[-1]:", errs[-1]
+            print "errs:", errs
+
+            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
+            # we iterate over params, so can't really do this check
+            # self.assertAlmostEqual(classification_error, 0.03, delta=0.5, msg="Classification error %s differs too much" % classification_error)
+
+            print "classErrorPctList:", classErrorPctList
+            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
+            # FIX! should update this expected classification error
+            predict = h2o.nodes[0].generate_predictions(model_key=model_key, data_key=data_key)
+
+            eList.append(classErrorPctList[4])
+            fList.append(trainElapsed)
+            if DO_PLOT:
+                if TRY == 'max_depth':
+                    xLabel = 'max_depth'
+                elif TRY == 'ntrees':
+                    xLabel = 'ntrees'
+                elif TRY == 'nbins':
+                    xLabel = 'nbins'
+                else:
+                    raise Exception("huh? %s" % TRY)
+                xList.append(paramDict[xLabel])
 
         if DO_PLOT:
             eLabel = 'class 4 pctWrong'