Skip to content

Commit

Permalink
Adding benchmarking scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
spennihana committed Oct 8, 2013
1 parent 085104c commit eeb50e2
Show file tree
Hide file tree
Showing 7 changed files with 300 additions and 0 deletions.
93 changes: 93 additions & 0 deletions bench/glmBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#GLM bench: times train-file parse, GLM build, and scoring on the Airlines
#and AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf

# Column order for glmbench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','scoreTime','AUC','error','AverageAccuracy')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }
def doGLM(fs, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row):
for (f in fs['train']):
overallWallStart = time.time()
if not os.path.exists('glmbench.csv'):
output = open('glmbench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('glmbench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
#Train File Parsing#
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
#End Train File Parse#
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row.update( {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
})

params = {'key' : hex_key,
'y' : y,
'x' : x,
'family' : family,
'link' : link,
'lambda' : lambda_,
'alpha' : alpha,
'n_folds' : nfolds,
'destination_key' : "python_GLM_key",
}

kwargs = params.copy()
glmStart = time.time()
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
glmTime = time.time() - glmStart
row.update( {'glmBuildTime' : glmTime,
'AverageAccuracy' : glm['validations']['err'],
})

glmScoreStart = time.time()
glmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=glm['GLMModel'])
scoreTime = time.time() - glmScoreStart

row.update( {'scoreTime' : scoreTime,
'AUC' : glmScore['validation']['auc'],
'error' : glmScore['validation']['err'],
})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    h2o_hosts.build_cloud_with_hosts()

    # -- Airlines test file parse --
    airlinesTestParseStart = time.time()
    # was `h2o.import_parse(..., schema-'local', ...)`: SyntaxError, and the
    # import_parse helper lives in h2i (as used everywhere else in this file)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/Airlines/AirlinesTest.csv',
                                schema='local', hex_key="atest.hex",
                                timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=3600)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart

    row = {'testParseWallTime': elapsedAirlinesTestParse}
    airlinesX = 'Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,Origin,Dest,Distance'
    doGLM(files['Airlines'], 'Airlines', 'binomial', 'logit', 1E-5, 0.5, 10,
          'IsDepDelayed', airlinesX, testFile['destination_key'], row)

    # -- AllBedrooms test file parse --
    allBedroomsTestParseStart = time.time()  # was `time/time()`: TypeError
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/AllBedrooms/AllBedroomsTest.csv',
                                schema='local', hex_key="allBtest.hex",
                                timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=3600)
    elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart

    row = {'testParseWallTime': elapsedAllBedroomsTestParse}
    # NOTE(review): the original passed an undefined name `x` here (NameError).
    # '' asks GLM to use its default predictor set -- TODO confirm the intended
    # AllBedrooms column list with the benchmark owner.
    doGLM(files['AllBedrooms'], 'AllBedrooms', 'gaussian', 'identity', 1E-5, 0.5, 10,
          'medrent', '', testFile['destination_key'], row)

    h2o.tear_down_cloud()
68 changes: 68 additions & 0 deletions bench/kmeansBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#KMeans bench: times train-file parse and KMeans model build on the Airlines
#and AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf

# Column order for kmeansbench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nRows','nCols','parseWallTime','kmeansBuildTime')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }

def doKMeans(fs, folderPath):
for (f in fs['train']):
overallWallStart = time.time()
if not os.path.exists('kmeansbench.csv'):
output = open('kmeansbench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('kmeansbench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
#Train File Parsing#
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
#End Train File Parse#
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row = {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
}

params = {'source_key' : hex_key,
'k' : 6,
'initialization' : 'Furthest',
'max_iter' : 100,
'seed' : 1234567,
'normalize' : 0,
#'cols' : ,
'destination_key' : "python_KMEANS_key",
}
kwargs = params.copy()
kmeansStart = time.time()
kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
kmeansTime = time.time() - kmeansStart
row.update({'kmeansBuildTime' : kmeansTime})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    # Spin up the remote cloud, benchmark each dataset group, then shut down.
    h2o_hosts.build_cloud_with_hosts()
    for dataset in ('Airlines', 'AllBedrooms'):
        doKMeans(files[dataset], dataset)
    h2o.tear_down_cloud()
64 changes: 64 additions & 0 deletions bench/pcaBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#PCA bench: times train-file parse and PCA model build on the Airlines and
#AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_util
import h2o_glm, h2o_exec as h2e, h2o_jobs
# Column order for pcabench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nRows','nCols','parseWallTime','pcaBuildTime')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }

def doPCA(fs, folderPath):
for f in fs['train']:
print "Doing PCA on ", f
overallWallStart = time.time()
if not os.path.exists('pcabench.csv'):
output = open('pcabench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('pcabench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/f/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row = {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
}

params = {'destination_key' : "python_PCA_key",
'ignore' : 0,
'tolerance' : 0.0,
'standardize' : 1,
}

kwargs = params.copy()
pcaStart = time.time()
pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
pcaTime = time.time() - pcaStart
row.update({'pcaBuildTime' : pcaTime})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    # Spin up the remote cloud, benchmark each dataset group, then shut down.
    h2o_hosts.build_cloud_with_hosts()
    for dataset in ('Airlines', 'AllBedrooms'):
        doPCA(files[dataset], dataset)
    h2o.tear_down_cloud()
1 change: 1 addition & 0 deletions bench/pcabench.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
java_heap_GB,dataset,nRows,nCols,parseWallTime,pcaBuildTime
26 changes: 26 additions & 0 deletions bench/pytest_config-161-164.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,

"ip": [
"192.168.1.161",
"192.168.1.162",
"192.168.1.163",
"192.168.1.164"
]
}

24 changes: 24 additions & 0 deletions bench/pytest_config-161.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,
"base_port": 55555,

"ip": [
"192.168.1.161"
]
}

24 changes: 24 additions & 0 deletions bench/pytest_config-spencer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,
"base_port": 55555,

"ip": [
"192.168.1.164"
]
}

0 comments on commit eeb50e2

Please sign in to comment.