Skip to content

Commit 197d31c

Browse files
committed
fully working fvec scripts
1 parent d066a25 commit 197d31c

File tree

2 files changed

+41
-33
lines changed

2 files changed

+41
-33
lines changed

bench/BMscripts/gbmBench.py

+19-14
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
sys.path.extend(['.','..'])
55
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs
66

7-
csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime')
7+
csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime','Error')
88

99
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
1010
'AllBedrooms': {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
@@ -33,7 +33,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
3333
dialect='excel', extrasaction='ignore',delimiter=',')
3434
try:
3535
java_heap_GB = h2o.nodes[0].java_heap_GB
36-
importFolderPath = bench + folderPath
36+
importFolderPath = bench + "/" + folderPath
3737
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']):
3838
csvPathname = importFolderPath + "/" + f + '.csv'
3939
else:
@@ -44,6 +44,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
4444
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
4545
headerKey = h2i.find_key(hK)
4646
trainParseWallStart = time.time()
47+
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
4748
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
4849
path = csvPathname,
4950
schema = 'local',
@@ -56,12 +57,12 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
5657
pollTimeoutSecs = 7200,
5758
noPoll = True,
5859
doSummary = False
59-
)
60+
)
6061
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
6162
parseWallTime = time.time() - trainParseWallStart
6263
print "Parsing training file took ", parseWallTime ," seconds."
63-
64-
inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'])
64+
h2o.beta_features = True
65+
inspect_train = h2o.nodes[0].inspect(hex_key)
6566
inspect_test = h2o.nodes[0].inspect(testFilehex)
6667

6768
nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
@@ -88,7 +89,8 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
8889
'nbins' : nbins,
8990
'learn_rate' : learnRate,
9091
}
91-
92+
93+
parseResult = {'destination_key' : hex_key}
9294
kwargs = params.copy()
9395
gbmStart = time.time()
9496
#TODO(spencer): Uses jobs to poll for gbm completion
@@ -97,10 +99,13 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
9799
gbmTime = time.time() - gbmStart
98100
row.update( {'gbmBuildTime' : gbmTime,
99101
})
100-
#TODO(spencer): Add in gbm scoring
101-
#gbmScoreStart = time.time()
102-
#gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
103-
#scoreTime = time.time() - gbmScoreStart
102+
gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
103+
if classification:
104+
cm = gbmTrainView['gbm_model']['cm']
105+
err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
106+
else:
107+
err = gbmTrainView['gbm_model']['errs'][-1]
108+
row.update({'Error' : err})
104109
csvWrt.writerow(row)
105110
finally:
106111
output.close()
@@ -129,7 +134,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
129134
doGBM(files['Airlines'], folderPath='Airlines',
130135
ignored_cols = ignored,
131136
classification = 1,
132-
testFilehex = testFile['destination_key'],
137+
testFilehex = 'atest.hex',
133138
ntrees = 100,
134139
depth = 5,
135140
minrows = 10,
@@ -172,14 +177,14 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
172177
headerKey = h2i.find_key(hK)
173178
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/AllBedrooms/AllBedroomsTest.csv', schema='local', hex_key="allBTest.hex", header=1, header_from_file=headerKey, separator=44,noPoll=True,doSummary=False)
174179
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
175-
elapsedAllBedroomsParse = time.time() - allBedroomsTestParseStart
180+
elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart
176181
row = {'testParseWallTime' : elapsedAllBedroomsTestParse}
177182
response = 'medrent'
178183
ignored = None
179-
doGBM(files['AllBedroom'], folderPath='AllBedrooms',
184+
doGBM(files['AllBedrooms'], folderPath='AllBedrooms',
180185
ignored_cols = ignored,
181186
classification = 0,
182-
testFilehex = testFile['destination_key'],
187+
testFilehex = "allBTest.hex",
183188
ntrees = 100,
184189
depth = 5,
185190
minrows = 10,

bench/BMscripts/glm2Bench.py

+22-19
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs
66
from pprint import pprint
77

8-
csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','scoreTime','AUC','AIC','error')
8+
csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','nPredictors','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','nIterations','AUC','AIC','AverageError')
99

1010
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
1111
'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
@@ -36,16 +36,16 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
3636
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']):
3737
csvPathname = importFolderPath + "/" + f + '.csv'
3838
else:
39-
print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..."
40-
continue
41-
#csvPathname = importFolderPath + "/" + f + "/*linked*"
39+
#print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..."
40+
#continue
41+
csvPathname = importFolderPath + "/" + f + "/*"
4242
hex_key = f + '.hex'
4343
hK = folderPath + "Header.csv"
4444
headerPathname = importFolderPath + "/" + hK
4545
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
4646
headerKey = h2i.find_key(hK)
4747
trainParseWallStart = time.time()
48-
h2o.beta_features=True
48+
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
4949
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
5050
path = csvPathname,
5151
schema = 'local',
@@ -63,7 +63,7 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
6363
parseResult = {'destination_key':hex_key}
6464
parseWallTime = time.time() - trainParseWallStart
6565
print "Parsing training file took ", parseWallTime ," seconds."
66-
66+
h2o.beta_features = True
6767
inspect_train = h2o.nodes[0].inspect(hex_key)
6868
inspect_test = h2o.nodes[0].inspect(testFilehex)
6969

@@ -97,24 +97,27 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
9797
row.update( {'glmBuildTime' : glmTime,
9898
#'AverageErrorOver10Folds' : glm['glm_model']['validations'][0]['err'],
9999
})
100+
#if "Bedrooms" in f:
101+
#print "Sleeping 30"
102+
#time.sleep(30)
100103
glmView = h2o_cmd.runGLMView(modelKey = "GLM("+f+")", timeoutSecs=380)
101-
pprint(glmView)
102104

103105
#glmScoreStart = time.time()
104106
#glmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
105107
#scoreTime = time.time() - glmScoreStart
106-
#if family == "binomial":
107-
# row.update( {'scoreTime' : scoreTime,
108-
# 'AUC' : glmScore['validation']['auc'],
109-
# 'AIC' : glmScore['validation']['aic'],
110-
# 'error' : glmScore['validation']['err'],
111-
# })
112-
#else:
113-
# row.update( {'scoreTime' : scoreTime,
114-
# 'AIC' : glmScore['validation']['aic'],
115-
# 'AUC' : 'NA',
116-
# 'error' : glmScore['validation']['err'],
117-
# })
108+
row.update( {'AIC' : glmView['glm_model']['validation']['aic'],
109+
'nIterations' : glmView['glm_model']['iteration'],
110+
'nPredictors' : len(glmView['glm_model']['beta']),
111+
'AverageError' : glmView['glm_model']['validation']['avg_err'],
112+
})
113+
if family == "binomial":
114+
row.update( {#'scoreTime' : scoreTime,
115+
'AUC' : glmView['glm_model']['validation']['auc'],
116+
})
117+
else:
118+
row.update( {#'scoreTime' : scoreTime,
119+
'AUC' : 'NA',
120+
})
118121
csvWrt.writerow(row)
119122
finally:
120123
output.close()

0 commit comments

Comments
 (0)