4
4
sys .path .extend (['.' ,'..' ])
5
5
import h2o_cmd , h2o , h2o_hosts , h2o_browse as h2b , h2o_import as h2i , h2o_rf , h2o_jobs
6
6
7
- csv_header = ('h2o_build' ,'nMachines' ,'nJVMs' ,'Xmx/JVM' ,'dataset' ,'nTrainRows' ,'nTestRows' ,'nCols' ,'trainParseWallTime' ,'classification' ,'gbmBuildTime' )
7
+ csv_header = ('h2o_build' ,'nMachines' ,'nJVMs' ,'Xmx/JVM' ,'dataset' ,'nTrainRows' ,'nTestRows' ,'nCols' ,'trainParseWallTime' ,'classification' ,'gbmBuildTime' , 'Error' )
8
8
9
9
files = {'Airlines' : {'train' : ('AirlinesTrain1x' , 'AirlinesTrain10x' , 'AirlinesTrain100x' ), 'test' : 'AirlinesTest' },
10
10
'AllBedrooms' : {'train' : ('AllBedroomsTrain1x' , 'AllBedroomsTrain10x' , 'AllBedroomsTrain100x' ), 'test' : 'AllBedroomsTest' },
@@ -33,7 +33,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
33
33
dialect = 'excel' , extrasaction = 'ignore' ,delimiter = ',' )
34
34
try :
35
35
java_heap_GB = h2o .nodes [0 ].java_heap_GB
36
- importFolderPath = bench + folderPath
36
+ importFolderPath = bench + "/" + folderPath
37
37
if (f in ['AirlinesTrain1x' ,'AllBedroomsTrain1x' , 'AllBedroomsTrain10x' , 'AllBedroomsTrain100x' ,'CovTypeTrain1x' , 'CovTypeTrain10x' , 'CovTypeTrain100x' ]):
38
38
csvPathname = importFolderPath + "/" + f + '.csv'
39
39
else :
@@ -44,6 +44,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
44
44
h2i .import_only (bucket = 'home-0xdiag-datasets' , path = headerPathname )
45
45
headerKey = h2i .find_key (hK )
46
46
trainParseWallStart = time .time ()
47
+ if f in (['AirlinesTrain10x' , 'AirlinesTrain100x' ]): h2o .beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
47
48
parseResult = h2i .import_parse (bucket = 'home-0xdiag-datasets' ,
48
49
path = csvPathname ,
49
50
schema = 'local' ,
@@ -56,12 +57,12 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
56
57
pollTimeoutSecs = 7200 ,
57
58
noPoll = True ,
58
59
doSummary = False
59
- )
60
+ )
60
61
h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 7200 , retryDelaySecs = 5 )
61
62
parseWallTime = time .time () - trainParseWallStart
62
63
print "Parsing training file took " , parseWallTime ," seconds."
63
-
64
- inspect_train = h2o .nodes [0 ].inspect (parseResult [ 'destination_key' ] )
64
+ h2o . beta_features = True
65
+ inspect_train = h2o .nodes [0 ].inspect (hex_key )
65
66
inspect_test = h2o .nodes [0 ].inspect (testFilehex )
66
67
67
68
nMachines = 1 if len (h2o_hosts .hosts ) is 0 else len (h2o_hosts .hosts )
@@ -88,7 +89,8 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
88
89
'nbins' : nbins ,
89
90
'learn_rate' : learnRate ,
90
91
}
91
-
92
+
93
+ parseResult = {'destination_key' : hex_key }
92
94
kwargs = params .copy ()
93
95
gbmStart = time .time ()
94
96
#TODO(spencer): Use jobs to poll for GBM completion
@@ -97,10 +99,13 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
97
99
gbmTime = time .time () - gbmStart
98
100
row .update ( {'gbmBuildTime' : gbmTime ,
99
101
})
100
- #TODO(spencer): Add in gbm scoring
101
- #gbmScoreStart = time.time()
102
- #gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
103
- #scoreTime = time.time() - gbmScoreStart
102
+ gbmTrainView = h2o_cmd .runGBMView (model_key = 'GBM(' + f + ')' )
103
+ if classification :
104
+ cm = gbmTrainView ['gbm_model' ]['cm' ]
105
+ err = 1.0 * (cm [0 ][1 ] + cm [1 ][0 ]) / (cm [0 ][0 ] + cm [0 ][1 ] + cm [1 ][0 ] + cm [1 ][1 ])
106
+ else :
107
+ err = gbmTrainView ['gbm_model' ]['errs' ][- 1 ]
108
+ row .update ({'Error' : err })
104
109
csvWrt .writerow (row )
105
110
finally :
106
111
output .close ()
@@ -129,7 +134,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
129
134
doGBM (files ['Airlines' ], folderPath = 'Airlines' ,
130
135
ignored_cols = ignored ,
131
136
classification = 1 ,
132
- testFilehex = testFile [ 'destination_key' ],
137
+ testFilehex = 'atest.hex' ,
133
138
ntrees = 100 ,
134
139
depth = 5 ,
135
140
minrows = 10 ,
@@ -172,14 +177,14 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
172
177
headerKey = h2i .find_key (hK )
173
178
testFile = h2i .import_parse (bucket = 'home-0xdiag-datasets' , path = bench + '/AllBedrooms/AllBedroomsTest.csv' , schema = 'local' , hex_key = "allBTest.hex" , header = 1 , header_from_file = headerKey , separator = 44 ,noPoll = True ,doSummary = False )
174
179
h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 7200 , retryDelaySecs = 5 )
175
- elapsedAllBedroomsParse = time .time () - allBedroomsTestParseStart
180
+ elapsedAllBedroomsTestParse = time .time () - allBedroomsTestParseStart
176
181
row = {'testParseWallTime' : elapsedAllBedroomsTestParse }
177
182
response = 'medrent'
178
183
ignored = None
179
- doGBM (files ['AllBedroom ' ], folderPath = 'AllBedrooms' ,
184
+ doGBM (files ['AllBedrooms ' ], folderPath = 'AllBedrooms' ,
180
185
ignored_cols = ignored ,
181
186
classification = 0 ,
182
- testFilehex = testFile [ 'destination_key' ] ,
187
+ testFilehex = "allBTest.hex" ,
183
188
ntrees = 100 ,
184
189
depth = 5 ,
185
190
minrows = 10 ,
0 commit comments