8
8
9
9
# Benchmark datasets, keyed by folder name. Each entry lists the training
# files (1x, 10x and 100x variants, in that order) and the matching test file.
# Names are given without extension.
files = {
    'Airlines': {
        'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'),
        'test': 'AirlinesTest',
    },
    'AllBedrooms': {
        'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'),
        'test': 'AllBedroomsTest',
    },
    'Covtype': {
        'train': ('CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'),
        'test': 'CovTypeTest',
    },
}

# H2O build identifier used in the benchmark output path; replaced from the
# command line when the file is run as a script.
build = ""
# Module-level debug flag; also replaced from the command line when run as a script.
debug = False
16
- def doGBM (fs , folderPath , ignored_cols , classification , testFilehex , ntrees , depth , minrows , nbins , learnRate , response , row ):
15
+ def doGBM (f , folderPath , ignored_cols , classification , testFilehex , ntrees , depth , minrows , nbins , learnRate , response , row ):
17
16
debug = False
18
17
h2o .beta_features = True
19
18
bench = "bench"
20
19
if debug :
21
20
print "Doing GBM DEBUG"
22
21
bench = "bench/debug"
23
22
date = '-' .join ([str (x ) for x in list (time .localtime ())][0 :3 ])
24
- for f in fs ['train' ]:
25
- overallWallStart = time .time ()
26
- pre = ""
27
- if debug : pre = 'DEBUG'
28
- gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
29
- if not os .path .exists (gbmbenchcsv ):
30
- output = open (gbmbenchcsv ,'w' )
31
- output .write (',' .join (csv_header )+ '\n ' )
32
- else :
33
- output = open (gbmbenchcsv ,'a' )
34
- csvWrt = csv .DictWriter (output , fieldnames = csv_header , restval = None ,
35
- dialect = 'excel' , extrasaction = 'ignore' ,delimiter = ',' )
36
- try :
37
- java_heap_GB = h2o .nodes [0 ].java_heap_GB
38
- importFolderPath = bench + "/" + folderPath
39
- if (f in ['AirlinesTrain1x' ,'AllBedroomsTrain1x' , 'AllBedroomsTrain10x' , 'AllBedroomsTrain100x' ,'CovTypeTrain1x' , 'CovTypeTrain10x' , 'CovTypeTrain100x' ]):
40
- csvPathname = importFolderPath + "/" + f + '.csv'
41
- else :
42
- csvPathname = importFolderPath + "/" + f + "/*linked*"
43
- hex_key = f + '.hex'
44
- hK = folderPath + "Header.csv"
45
- headerPathname = importFolderPath + "/" + hK
46
- h2i .import_only (bucket = 'home-0xdiag-datasets' , path = headerPathname )
47
- headerKey = h2i .find_key (hK )
48
- trainParseWallStart = time .time ()
49
- if f in (['AirlinesTrain10x' , 'AirlinesTrain100x' ]): h2o .beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
50
- parseResult = h2i .import_parse (bucket = 'home-0xdiag-datasets' ,
51
- path = csvPathname ,
52
- schema = 'local' ,
53
- hex_key = hex_key ,
54
- header = 1 ,
55
- header_from_file = headerKey ,
56
- separator = 44 ,
57
- timeoutSecs = 7200 ,
58
- retryDelaySecs = 5 ,
59
- pollTimeoutSecs = 7200 ,
60
- noPoll = True ,
61
- doSummary = False
62
- )
63
- h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 7200 , retryDelaySecs = 5 )
64
- parseWallTime = time .time () - trainParseWallStart
65
- print "Parsing training file took " , parseWallTime ," seconds."
66
- #h2o.beta_features = True
67
- inspect_train = h2o .nodes [0 ].inspect (hex_key , timeoutSecs = 7200 )
68
- inspect_test = h2o .nodes [0 ].inspect (testFilehex , timeoutSecs = 7200 )
69
- h2o .beta_features = True
70
- nMachines = 1 if len (h2o_hosts .hosts ) is 0 else len (h2o_hosts .hosts )
71
- row .update ( {'h2o_build' : build ,
72
- 'nMachines' : nMachines ,
73
- 'nJVMs' : len (h2o .nodes ),
74
- 'Xmx/JVM' : java_heap_GB ,
75
- 'dataset' : f ,
76
- 'nTrainRows' : inspect_train ['num_rows' ],
77
- 'nTestRows' : inspect_test ['num_rows' ],
78
- 'nCols' : inspect_train ['num_cols' ],
79
- 'trainParseWallTime' : parseWallTime ,
80
- 'classification' : classification ,
81
- })
82
-
83
- params = {'destination_key' : 'GBM(' + f + ')' ,
84
- 'response' : response ,
85
- 'ignored_cols_by_name' : ignored_cols ,
86
- 'classification' : classification ,
87
- 'validation' : testFilehex ,
88
- 'ntrees' : ntrees ,
89
- 'max_depth' : depth ,
90
- 'min_rows' : minrows ,
91
- 'nbins' : nbins ,
92
- 'learn_rate' : learnRate ,
93
- }
23
+ overallWallStart = time .time ()
24
+ pre = ""
25
+ if debug : pre = 'DEBUG'
26
+ gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
27
+ if not os .path .exists (gbmbenchcsv ):
28
+ output = open (gbmbenchcsv ,'w' )
29
+ output .write (',' .join (csv_header )+ '\n ' )
30
+ else :
31
+ output = open (gbmbenchcsv ,'a' )
32
+ csvWrt = csv .DictWriter (output , fieldnames = csv_header , restval = None ,
33
+ dialect = 'excel' , extrasaction = 'ignore' ,delimiter = ',' )
34
+ try :
35
+ java_heap_GB = h2o .nodes [0 ].java_heap_GB
36
+ importFolderPath = bench + "/" + folderPath
37
+ if (f in ['AirlinesTrain1x' ,'AllBedroomsTrain1x' , 'AllBedroomsTrain10x' , 'AllBedroomsTrain100x' ,'CovTypeTrain1x' , 'CovTypeTrain10x' , 'CovTypeTrain100x' ]):
38
+ csvPathname = importFolderPath + "/" + f + '.csv'
39
+ else :
40
+ csvPathname = importFolderPath + "/" + f + "/*linked*"
41
+ hex_key = f + '.hex'
42
+ hK = folderPath + "Header.csv"
43
+ headerPathname = importFolderPath + "/" + hK
44
+ h2i .import_only (bucket = 'home-0xdiag-datasets' , path = headerPathname )
45
+ headerKey = h2i .find_key (hK )
46
+ trainParseWallStart = time .time ()
47
+ if f in (['AirlinesTrain10x' , 'AirlinesTrain100x' ]): h2o .beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
48
+ parseResult = h2i .import_parse (bucket = 'home-0xdiag-datasets' ,
49
+ path = csvPathname ,
50
+ schema = 'local' ,
51
+ hex_key = hex_key ,
52
+ header = 1 ,
53
+ header_from_file = headerKey ,
54
+ separator = 44 ,
55
+ timeoutSecs = 7200 ,
56
+ retryDelaySecs = 5 ,
57
+ pollTimeoutSecs = 7200 ,
58
+ noPoll = True ,
59
+ doSummary = False
60
+ )
61
+ h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 7200 , retryDelaySecs = 5 )
62
+ parseWallTime = time .time () - trainParseWallStart
63
+ print "Parsing training file took " , parseWallTime ," seconds."
64
+ #h2o.beta_features = True
65
+ inspect_train = h2o .nodes [0 ].inspect (hex_key , timeoutSecs = 7200 )
66
+ inspect_test = h2o .nodes [0 ].inspect (testFilehex , timeoutSecs = 7200 )
67
+ h2o .beta_features = True
68
+ nMachines = 1 if len (h2o_hosts .hosts ) is 0 else len (h2o_hosts .hosts )
69
+ row .update ( {'h2o_build' : build ,
70
+ 'nMachines' : nMachines ,
71
+ 'nJVMs' : len (h2o .nodes ),
72
+ 'Xmx/JVM' : java_heap_GB ,
73
+ 'dataset' : f ,
74
+ 'nTrainRows' : inspect_train ['num_rows' ],
75
+ 'nTestRows' : inspect_test ['num_rows' ],
76
+ 'nCols' : inspect_train ['num_cols' ],
77
+ 'trainParseWallTime' : parseWallTime ,
78
+ 'classification' : classification ,
79
+ })
80
+
81
+ params = {'destination_key' : 'GBM(' + f + ')' ,
82
+ 'response' : response ,
83
+ 'ignored_cols_by_name' : ignored_cols ,
84
+ 'classification' : classification ,
85
+ 'validation' : testFilehex ,
86
+ 'ntrees' : ntrees ,
87
+ 'max_depth' : depth ,
88
+ 'min_rows' : minrows ,
89
+ 'nbins' : nbins ,
90
+ 'learn_rate' : learnRate ,
91
+ }
94
92
95
- parseResult = {'destination_key' : hex_key }
96
- kwargs = params .copy ()
97
- gbmStart = time .time ()
98
- #TODO(spencer): Uses jobs to poll for gbm completion
99
- gbm = h2o_cmd .runGBM (parseResult = parseResult , noPoll = True , timeoutSecs = 4800 , ** kwargs )
100
- h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 120 , retryDelaySecs = 5 )
101
- gbmTime = time .time () - gbmStart
102
- row .update ( {'gbmBuildTime' : gbmTime ,
103
- })
104
- gbmTrainView = h2o_cmd .runGBMView (model_key = 'GBM(' + f + ')' )
105
- if classification :
106
- cm = gbmTrainView ['gbm_model' ]['cm' ]
107
- err = 1.0 * (cm [0 ][1 ] + cm [1 ][0 ]) / (cm [0 ][0 ] + cm [0 ][1 ] + cm [1 ][0 ] + cm [1 ][1 ])
108
- else :
109
- err = gbmTrainView ['gbm_model' ]['errs' ][- 1 ]
110
- row .update ({'Error' : err })
111
- csvWrt .writerow (row )
112
- finally :
113
- output .close ()
93
+ parseResult = {'destination_key' : hex_key }
94
+ kwargs = params .copy ()
95
+ gbmStart = time .time ()
96
+ #TODO(spencer): Uses jobs to poll for gbm completion
97
+ gbm = h2o_cmd .runGBM (parseResult = parseResult , noPoll = True , timeoutSecs = 4800 , ** kwargs )
98
+ h2o_jobs .pollWaitJobs (timeoutSecs = 7200 , pollTimeoutSecs = 120 , retryDelaySecs = 5 )
99
+ gbmTime = time .time () - gbmStart
100
+ row .update ( {'gbmBuildTime' : gbmTime ,
101
+ })
102
+ gbmTrainView = h2o_cmd .runGBMView (model_key = 'GBM(' + f + ')' )
103
+ if classification :
104
+ cm = gbmTrainView ['gbm_model' ]['cm' ]
105
+ err = 1.0 * (cm [0 ][1 ] + cm [1 ][0 ]) / (cm [0 ][0 ] + cm [0 ][1 ] + cm [1 ][0 ] + cm [1 ][1 ])
106
+ else :
107
+ err = gbmTrainView ['gbm_model' ]['errs' ][- 1 ]
108
+ row .update ({'Error' : err })
109
+ csvWrt .writerow (row )
110
+ finally :
111
+ output .close ()
114
112
115
113
if __name__ == '__main__':
    # The last three command-line arguments are, in order: build, debug,
    # dataset. They are popped from the end before H2O parses the rest.
    dat = sys.argv.pop(-1)
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)

    # Dataset tag -> (folder name, index into files[folder]['train']).
    datasets = {
        'Air1x': ('Airlines', 0),
        'Air10x': ('Airlines', 1),
        'Air100x': ('Airlines', 2),
        'AllB1x': ('AllBedrooms', 0),
        'AllB10x': ('AllBedrooms', 1),
        'AllB100x': ('AllBedrooms', 2),
    }
    # Reject an unknown tag before any cloud is started, instead of failing
    # later on an unbound training-file name.
    if dat not in datasets:
        raise ValueError("unknown dataset %r; expected one of: %s"
                         % (dat, ', '.join(sorted(datasets))))
    fp, trainIndex = datasets[dat]
    fs = files[fp]['train'][trainIndex]

    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)
    bench = "bench"
    h2o.beta_features = True
    # NOTE(review): `debug` is a raw command-line string here, so any
    # non-empty value (even "False") selects the debug folder -- confirm.
    if debug:
        bench = "bench/debug"

    # Per-folder settings: key for the parsed test frame, response column,
    # and the classification flag passed to GBM.
    suites = {
        'Airlines': {'testHex': 'atest.hex', 'response': 'IsDepDelayed', 'classification': 1},
        'AllBedrooms': {'testHex': 'allBTest.hex', 'response': 'medrent', 'classification': 0},
    }
    # NOTE: the Covtype benchmark is currently disabled; its driver code was
    # commented out in this script.
    suite = suites[fp]

    # Parse the test file once and time it; the elapsed time seeds the CSV row.
    testParseStart = time.time()
    hK = fp + "Header.csv"
    headerPathname = bench + "/" + fp + "/" + hK
    h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
    headerKey = h2i.find_key(hK)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets',
                                path=bench + '/' + fp + '/' + files[fp]['test'] + '.csv',
                                schema='local',
                                hex_key=suite['testHex'],
                                header=1,
                                header_from_file=headerKey,
                                separator=44,
                                noPoll=True,
                                doSummary=False)
    h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
    elapsedTestParse = time.time() - testParseStart

    row = {'testParseWallTime': elapsedTestParse}
    response = suite['response']
    ignored = None
    doGBM(fs, fp,
          ignored_cols=ignored,
          classification=suite['classification'],
          testFilehex=suite['testHex'],
          ntrees=100,
          depth=5,
          minrows=10,
          nbins=100,
          learnRate=0.01,
          response=response,
          row=row
          )

    h2o.tear_down_cloud()
0 commit comments