fully working fvec scripts

spennihana · spennihana · commit 197d31c53f3c · 2013-10-20T00:31:37.000-07:00
diff --git a/bench/BMscripts/gbmBench.py b/bench/BMscripts/gbmBench.py
@@ -4,7 +4,7 @@
 sys.path.extend(['.','..'])
 import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs
 
-csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime')
+csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime','Error')
 
 files      = {'Airlines'    : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'),         'test' : 'AirlinesTest'},
               'AllBedrooms': {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
@@ -33,7 +33,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
                         dialect='excel', extrasaction='ignore',delimiter=',')
         try:
             java_heap_GB = h2o.nodes[0].java_heap_GB
-            importFolderPath = bench + folderPath
+            importFolderPath = bench + "/" + folderPath
             if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): 
                 csvPathname = importFolderPath + "/" + f + '.csv'
             else: 
@@ -44,6 +44,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
             h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
             headerKey = h2i.find_key(hK)
             trainParseWallStart = time.time()
+            if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False # regex (wildcard) path parsing misbehaves when not driven from the browser, so parse these with beta_features off (VA) and rely on the VA -> FVEC converter
             parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                            path             = csvPathname,
                                            schema           = 'local',
@@ -56,12 +57,12 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
                                            pollTimeoutSecs  = 7200,
                                            noPoll           = True,
                                            doSummary        = False
-                                          )             
+                                          )
             h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
             parseWallTime = time.time() - trainParseWallStart
             print "Parsing training file took ", parseWallTime ," seconds." 
-        
-            inspect_train  = h2o.nodes[0].inspect(parseResult['destination_key'])
+            h2o.beta_features = True
+            inspect_train  = h2o.nodes[0].inspect(hex_key)
             inspect_test   = h2o.nodes[0].inspect(testFilehex)
             
             nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
@@ -88,7 +89,8 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
                          'nbins'                : nbins,
                          'learn_rate'           : learnRate,
                         }
-
+    
+            parseResult = {'destination_key' : hex_key}
             kwargs    = params.copy()
             gbmStart  = time.time()
             #TODO(spencer): Uses jobs to poll for gbm completion
@@ -97,10 +99,13 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
             gbmTime   = time.time() - gbmStart
             row.update( {'gbmBuildTime'       : gbmTime,
                         })
-            #TODO(spencer): Add in gbm scoring
-            #gbmScoreStart = time.time()
-            #gbmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
-            #scoreTime     = time.time() - gbmScoreStart
+            gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
+            if classification:
+                cm = gbmTrainView['gbm_model']['cm']
+                err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
+            else:
+                err = gbmTrainView['gbm_model']['errs'][-1]
+            row.update({'Error' : err})
             csvWrt.writerow(row)
         finally:
             output.close()
@@ -129,7 +134,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
     doGBM(files['Airlines'], folderPath='Airlines', 
             ignored_cols    = ignored, 
             classification  = 1,
-            testFilehex     = testFile['destination_key'], 
+            testFilehex     = 'atest.hex',
             ntrees          = 100,
             depth           = 5,
             minrows         = 10,
@@ -172,14 +177,14 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
     headerKey                   = h2i.find_key(hK)
     testFile                    = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/AllBedrooms/AllBedroomsTest.csv', schema='local', hex_key="allBTest.hex", header=1, header_from_file=headerKey, separator=44,noPoll=True,doSummary=False)
     h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
-    elapsedAllBedroomsParse = time.time() - allBedroomsTestParseStart
+    elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart
     row = {'testParseWallTime' : elapsedAllBedroomsTestParse}
     response = 'medrent'
     ignored  = None
-    doGBM(files['AllBedroom'], folderPath='AllBedrooms',
+    doGBM(files['AllBedrooms'], folderPath='AllBedrooms',
             ignored_cols    = ignored,
             classification  = 0,
-            testFilehex     = testFile['destination_key'],
+            testFilehex     = "allBTest.hex",
             ntrees          = 100,
             depth           = 5,
             minrows         = 10,
diff --git a/bench/BMscripts/glm2Bench.py b/bench/BMscripts/glm2Bench.py
@@ -5,7 +5,7 @@
 import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs
 from pprint import pprint
 
-csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','scoreTime','AUC','AIC','error')
+csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','nPredictors','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','nIterations','AUC','AIC','AverageError')
 
 files      = {'Airlines'    : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'),          'test' : 'AirlinesTest'},
               'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
@@ -36,16 +36,16 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
             if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): 
                 csvPathname = importFolderPath + "/" + f + '.csv'
             else:
-                print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..." 
-                continue
-                #csvPathname = importFolderPath + "/" + f + "/*linked*"
+                #print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..." 
+                #continue
+                csvPathname = importFolderPath + "/" + f + "/*"
             hex_key         = f + '.hex'
             hK              = folderPath + "Header.csv"    
             headerPathname  = importFolderPath + "/" + hK
             h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
             headerKey       = h2i.find_key(hK)
             trainParseWallStart = time.time()
-            h2o.beta_features=True
+            if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False # regex (wildcard) path parsing misbehaves when not driven from the browser, so parse these with beta_features off (VA) and rely on the VA -> FVEC converter
             parseResult = h2i.import_parse(bucket           = 'home-0xdiag-datasets',
                                            path             = csvPathname,
                                            schema           = 'local',
@@ -63,7 +63,7 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
             parseResult = {'destination_key':hex_key}
             parseWallTime = time.time() - trainParseWallStart
             print "Parsing training file took ", parseWallTime ," seconds." 
-            
+            h2o.beta_features = True
             inspect_train  = h2o.nodes[0].inspect(hex_key)
             inspect_test   = h2o.nodes[0].inspect(testFilehex)
             
@@ -97,24 +97,27 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
             row.update( {'glmBuildTime'       : glmTime,
                          #'AverageErrorOver10Folds'    : glm['glm_model']['validations'][0]['err'],
                         })
+            #if "Bedrooms" in f: 
+                #print "Sleeping 30"
+                #time.sleep(30)
             glmView = h2o_cmd.runGLMView(modelKey = "GLM("+f+")", timeoutSecs=380)
-            pprint(glmView)
 
             #glmScoreStart = time.time()
             #glmScore      = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
             #scoreTime     = time.time() - glmScoreStart
-            #if family == "binomial":
-            #    row.update( {'scoreTime'          : scoreTime,
-            #                 'AUC'                : glmScore['validation']['auc'],
-            #                 'AIC'                : glmScore['validation']['aic'],
-            #                 'error'              : glmScore['validation']['err'],
-            #                })
-            #else:
-            #    row.update( {'scoreTime'          : scoreTime,
-            #                 'AIC'                : glmScore['validation']['aic'],
-            #                 'AUC'                : 'NA',
-            #                 'error'              : glmScore['validation']['err'],
-            #                })
+            row.update( {'AIC'          : glmView['glm_model']['validation']['aic'],
+                         'nIterations'  : glmView['glm_model']['iteration'],
+                         'nPredictors'  : len(glmView['glm_model']['beta']),
+                         'AverageError' : glmView['glm_model']['validation']['avg_err'],
+                        })
+            if family == "binomial":
+                row.update( {#'scoreTime'          : scoreTime,
+                             'AUC'                : glmView['glm_model']['validation']['auc'],
+                            })
+            else:
+                row.update( {#'scoreTime'          : scoreTime,
+                             'AUC'                : 'NA',
+                            })
             csvWrt.writerow(row)
         finally:
             output.close()