Skip to content

Commit

Permalink
Adding benchmarking scripts.
Browse files Browse the repository at this point in the history
  • Loading branch information
spennihana committed Oct 8, 2013
1 parent 085104c commit eeb50e2
Show file tree
Hide file tree
Showing 7 changed files with 300 additions and 0 deletions.
93 changes: 93 additions & 0 deletions bench/glmBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#GLM bench: times train-file parse, GLM build, and scoring on the Airlines
#and AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf

# Column order for glmbench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nTrainRows','nTestRows','nCols','trainParseWallTime','nfolds','glmBuildTime','testParseWallTime','scoreTime','AUC','error','AverageAccuracy')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }
def doGLM(fs, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row):
for (f in fs['train']):
overallWallStart = time.time()
if not os.path.exists('glmbench.csv'):
output = open('glmbench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('glmbench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
#Train File Parsing#
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
#End Train File Parse#
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row.update( {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
})

params = {'key' : hex_key,
'y' : y,
'x' : x,
'family' : family,
'link' : link,
'lambda' : lambda_,
'alpha' : alpha,
'n_folds' : nfolds,
'destination_key' : "python_GLM_key",
}

kwargs = params.copy()
glmStart = time.time()
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
glmTime = time.time() - glmStart
row.update( {'glmBuildTime' : glmTime,
'AverageAccuracy' : glm['validations']['err'],
})

glmScoreStart = time.time()
glmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=glm['GLMModel'])
scoreTime = time.time() - glmScoreStart

row.update( {'scoreTime' : scoreTime,
'AUC' : glmScore['validation']['auc'],
'error' : glmScore['validation']['err'],
})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    h2o_hosts.build_cloud_with_hosts()

    # -- Airlines test file parse --
    airlinesTestParseStart = time.time()
    # was `h2o.import_parse(..., schema-'local', ...)`: SyntaxError, and the
    # import_parse helper lives in h2i (as used everywhere else in this file)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/Airlines/AirlinesTest.csv',
                                schema='local', hex_key="atest.hex",
                                timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=3600)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart

    row = {'testParseWallTime': elapsedAirlinesTestParse}
    airlinesX = 'Year,Month,DayofMonth,DayOfWeek,DepTime,ArrTime,UniqueCarrier,FlightNum,TailNum,Origin,Dest,Distance'
    doGLM(files['Airlines'], 'Airlines', 'binomial', 'logit', 1E-5, 0.5, 10,
          'IsDepDelayed', airlinesX, testFile['destination_key'], row)

    # -- AllBedrooms test file parse --
    allBedroomsTestParseStart = time.time()  # was `time/time()`: TypeError
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/AllBedrooms/AllBedroomsTest.csv',
                                schema='local', hex_key="allBtest.hex",
                                timeoutSecs=3600, retryDelaySecs=5, pollTimeoutSecs=3600)
    elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart

    row = {'testParseWallTime': elapsedAllBedroomsTestParse}
    # NOTE(review): the original passed an undefined name `x` here (NameError).
    # '' asks GLM to use its default predictor set -- TODO confirm the intended
    # AllBedrooms column list with the benchmark owner.
    doGLM(files['AllBedrooms'], 'AllBedrooms', 'gaussian', 'identity', 1E-5, 0.5, 10,
          'medrent', '', testFile['destination_key'], row)

    h2o.tear_down_cloud()
68 changes: 68 additions & 0 deletions bench/kmeansBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#KMeans bench: times train-file parse and KMeans model build on the Airlines
#and AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf

# Column order for kmeansbench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nRows','nCols','parseWallTime','kmeansBuildTime')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }

def doKMeans(fs, folderPath):
for (f in fs['train']):
overallWallStart = time.time()
if not os.path.exists('kmeansbench.csv'):
output = open('kmeansbench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('kmeansbench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
#Train File Parsing#
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
#End Train File Parse#
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row = {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
}

params = {'source_key' : hex_key,
'k' : 6,
'initialization' : 'Furthest',
'max_iter' : 100,
'seed' : 1234567,
'normalize' : 0,
#'cols' : ,
'destination_key' : "python_KMEANS_key",
}
kwargs = params.copy()
kmeansStart = time.time()
kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
kmeansTime = time.time() - kmeansStart
row.update({'kmeansBuildTime' : kmeansTime})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    # Spin up the remote cloud, benchmark each dataset group, then shut down.
    h2o_hosts.build_cloud_with_hosts()
    for dataset in ('Airlines', 'AllBedrooms'):
        doKMeans(files[dataset], dataset)
    h2o.tear_down_cloud()
64 changes: 64 additions & 0 deletions bench/pcaBench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#PCA bench: times train-file parse and PCA model build on the Airlines and
#AllBedrooms benchmark datasets, appending one CSV row per training file.
import os, sys, time, csv
sys.path.append('../py/')
sys.path.extend(['.','..'])
import h2o_cmd, h2o, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_rf, h2o_util
import h2o_glm, h2o_exec as h2e, h2o_jobs
# Column order for pcabench.csv. The DictWriter below is created with
# extrasaction='ignore', so any row key not spelled exactly as listed here
# is silently dropped from the output.
csv_header = ('java_heap_GB','dataset','nRows','nCols','parseWallTime','pcaBuildTime')

# Training replicates (1x/10x/100x) and the single test file per dataset.
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
         'AllBedrooms' : {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
        }

def doPCA(fs, folderPath):
for f in fs['train']:
print "Doing PCA on ", f
overallWallStart = time.time()
if not os.path.exists('pcabench.csv'):
output = open('pcabench.csv','w')
output.write(','.join(csv_header)+'\n')
else:
output = open('pcabench.csv','a')
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
dialect='excel', extrasaction='ignore',delimiter=',')
try:
java_heap_GB = h2o.nodes[0].java_heap_GB
importFolderPath = "bench-test/" + folderPath
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv'
else: csvPathname = importFolderPath + "/f/*linked*"
hex_key = f + '.hex'
trainParseWallStart = time.time()
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
timeoutSecs=3600,retryDelaySecs=5,pollTimeoutSecs=3600)
parseWallTime = time.time() - trainParseWallStart
print "Parsing training file took ", trainParseWallTime ," seconds."

inspect = h2o.nodes[0].inspect(parseResult['destination_key'])

row = {'java_heap_GB' : java_heap_GB,
'dataset' : f,
'nRows' : inspect['num_rows'],
'nCols' : inspect['num_cols'],
'ParseWallTime' : parseWallTime,
}

params = {'destination_key' : "python_PCA_key",
'ignore' : 0,
'tolerance' : 0.0,
'standardize' : 1,
}

kwargs = params.copy()
pcaStart = time.time()
pcaResult = h2o_cmd.runPCA(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
pcaTime = time.time() - pcaStart
row.update({'pcaBuildTime' : pcaTime})
csvWrt.writerow(row)
finally:
output.close()

if __name__ == '__main__':
    # Spin up the remote cloud, benchmark each dataset group, then shut down.
    h2o_hosts.build_cloud_with_hosts()
    for dataset in ('Airlines', 'AllBedrooms'):
        doPCA(files[dataset], dataset)
    h2o.tear_down_cloud()
1 change: 1 addition & 0 deletions bench/pcabench.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
java_heap_GB,dataset,nRows,nCols,parseWallTime,pcaBuildTime
26 changes: 26 additions & 0 deletions bench/pytest_config-161-164.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,

"ip": [
"192.168.1.161",
"192.168.1.162",
"192.168.1.163",
"192.168.1.164"
]
}

24 changes: 24 additions & 0 deletions bench/pytest_config-161.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,
"base_port": 55555,

"ip": [
"192.168.1.161"
]
}

24 changes: 24 additions & 0 deletions bench/pytest_config-spencer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

{
"_comment" : "no password, or None, means none will be used in ssh. Assumes keys have been exchanged. Watch out for common error: last entry in list can't have comma. Only ip is used. Others ignored, saved here for sometime use. Last ip value used if duplicates? Leading _ means not used. First ip/h2o is used for H2O json communication..vary ip list to direct that. See http://docs.python.org/library/json.html for json->python data types. Note lower case for true/false. null is None, but can just not have an entry. Defaults are in h2o_hosts.py",

"username": "0xdiag",
"password": "0xdiag",
"h2o_remote_buckets_root": "/home/0xdiag",
"use_home_for_ice": true,

"h2o_per_host": 1,
"java_heap_GB": 100,

"hdfs_name_node": "192.168.1.176",
"hdfs_version": "cdh3",
"use_hdfs": true,

"use_flatfile": true,
"base_port": 55555,

"ip": [
"192.168.1.164"
]
}

0 comments on commit eeb50e2

Please sign in to comment.