Skip to content

Commit

Permalink
fully working fvec scripts
Browse files (browse the repository at this point in the history)
  • Loading branch information
spennihana committed Oct 20, 2013
1 parent d066a25 commit 197d31c
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 33 deletions.
33 changes: 19 additions & 14 deletions bench/BMscripts/gbmBench.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs

csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime')
csv_header = ('h2o_build','nMachines','nJVMs','Xmx/JVM','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','classification','gbmBuildTime','Error')

files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
'AllBedrooms': {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
Expand Down Expand Up @@ -33,7 +33,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
importFolderPath = bench + folderPath
importFolderPath = bench + "/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']):
csvPathname = importFolderPath + "/" + f + '.csv'
else:
Expand All @@ -44,6 +44,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
headerKey = h2i.find_key(hK)
trainParseWallStart = time.time()
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
path = csvPathname,
schema = 'local',
Expand All @@ -56,12 +57,12 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
pollTimeoutSecs = 7200,
noPoll = True,
doSummary = False
)
)
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
parseWallTime = time.time() - trainParseWallStart
print "Parsing training file took ", parseWallTime ," seconds."

inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'])
h2o.beta_features = True
inspect_train = h2o.nodes[0].inspect(hex_key)
inspect_test = h2o.nodes[0].inspect(testFilehex)

nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
Expand All @@ -88,7 +89,8 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
'nbins' : nbins,
'learn_rate' : learnRate,
}


parseResult = {'destination_key' : hex_key}
kwargs = params.copy()
gbmStart = time.time()
#TODO(spencer): Uses jobs to poll for gbm completion
Expand All @@ -97,10 +99,13 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
gbmTime = time.time() - gbmStart
row.update( {'gbmBuildTime' : gbmTime,
})
#TODO(spencer): Add in gbm scoring
#gbmScoreStart = time.time()
#gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
#scoreTime = time.time() - gbmScoreStart
gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
if classification:
cm = gbmTrainView['gbm_model']['cm']
err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
else:
err = gbmTrainView['gbm_model']['errs'][-1]
row.update({'Error' : err})
csvWrt.writerow(row)
finally:
output.close()
Expand Down Expand Up @@ -129,7 +134,7 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
doGBM(files['Airlines'], folderPath='Airlines',
ignored_cols = ignored,
classification = 1,
testFilehex = testFile['destination_key'],
testFilehex = 'atest.hex',
ntrees = 100,
depth = 5,
minrows = 10,
Expand Down Expand Up @@ -172,14 +177,14 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
headerKey = h2i.find_key(hK)
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/AllBedrooms/AllBedroomsTest.csv', schema='local', hex_key="allBTest.hex", header=1, header_from_file=headerKey, separator=44,noPoll=True,doSummary=False)
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
elapsedAllBedroomsParse = time.time() - allBedroomsTestParseStart
elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart
row = {'testParseWallTime' : elapsedAllBedroomsTestParse}
response = 'medrent'
ignored = None
doGBM(files['AllBedroom'], folderPath='AllBedrooms',
doGBM(files['AllBedrooms'], folderPath='AllBedrooms',
ignored_cols = ignored,
classification = 0,
testFilehex = testFile['destination_key'],
testFilehex = "allBTest.hex",
ntrees = 100,
depth = 5,
minrows = 10,
Expand Down
41 changes: 22 additions & 19 deletions bench/BMscripts/glm2Bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_jobs
from pprint import pprint

csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','scoreTime','AUC','AIC','error')
csv_header = ('h2o_build','java_heap_GB','dataset','nTrainRows','nTestRows','nCols','nPredictors','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','nIterations','AUC','AIC','AverageError')

files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
Expand Down Expand Up @@ -36,16 +36,16 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']):
csvPathname = importFolderPath + "/" + f + '.csv'
else:
print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..."
continue
#csvPathname = importFolderPath + "/" + f + "/*linked*"
#print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..."
#continue
csvPathname = importFolderPath + "/" + f + "/*"
hex_key = f + '.hex'
hK = folderPath + "Header.csv"
headerPathname = importFolderPath + "/" + hK
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
headerKey = h2i.find_key(hK)
trainParseWallStart = time.time()
h2o.beta_features=True
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
path = csvPathname,
schema = 'local',
Expand All @@ -63,7 +63,7 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
parseResult = {'destination_key':hex_key}
parseWallTime = time.time() - trainParseWallStart
print "Parsing training file took ", parseWallTime ," seconds."

h2o.beta_features = True
inspect_train = h2o.nodes[0].inspect(hex_key)
inspect_test = h2o.nodes[0].inspect(testFilehex)

Expand Down Expand Up @@ -97,24 +97,27 @@ def doGLM2(fs, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, ro
row.update( {'glmBuildTime' : glmTime,
#'AverageErrorOver10Folds' : glm['glm_model']['validations'][0]['err'],
})
#if "Bedrooms" in f:
#print "Sleeping 30"
#time.sleep(30)
glmView = h2o_cmd.runGLMView(modelKey = "GLM("+f+")", timeoutSecs=380)
pprint(glmView)

#glmScoreStart = time.time()
#glmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key'])
#scoreTime = time.time() - glmScoreStart
#if family == "binomial":
# row.update( {'scoreTime' : scoreTime,
# 'AUC' : glmScore['validation']['auc'],
# 'AIC' : glmScore['validation']['aic'],
# 'error' : glmScore['validation']['err'],
# })
#else:
# row.update( {'scoreTime' : scoreTime,
# 'AIC' : glmScore['validation']['aic'],
# 'AUC' : 'NA',
# 'error' : glmScore['validation']['err'],
# })
row.update( {'AIC' : glmView['glm_model']['validation']['aic'],
'nIterations' : glmView['glm_model']['iteration'],
'nPredictors' : len(glmView['glm_model']['beta']),
'AverageError' : glmView['glm_model']['validation']['avg_err'],
})
if family == "binomial":
row.update( {#'scoreTime' : scoreTime,
'AUC' : glmView['glm_model']['validation']['auc'],
})
else:
row.update( {#'scoreTime' : scoreTime,
'AUC' : 'NA',
})
csvWrt.writerow(row)
finally:
output.close()
Expand Down

0 comments on commit 197d31c

Please sign in to comment.