Skip to content

Commit 44e0b3d

Browse files
committed
tear/build between data set size
1 parent 23703ba commit 44e0b3d

File tree

3 files changed

+216
-238
lines changed

3 files changed

+216
-238
lines changed

bench/BMscripts/gbmBench.py

+152-168
Original file line numberDiff line numberDiff line change
@@ -8,144 +8,179 @@
88

99
files = {'Airlines' : {'train': ('AirlinesTrain1x', 'AirlinesTrain10x', 'AirlinesTrain100x'), 'test' : 'AirlinesTest'},
1010
'AllBedrooms': {'train': ('AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x'), 'test' : 'AllBedroomsTest'},
11-
'Airlines100' : {'train': ('AirlinesTrain100x'), 'test' : 'AirlinesTest'},
1211
'Covtype' : {'train': ('CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x'), 'test' : 'CovTypeTest'},
1312
}
1413
build = ""
1514
debug = False
16-
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
15+
def doGBM(f, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
1716
debug = False
1817
h2o.beta_features = True
1918
bench = "bench"
2019
if debug:
2120
print "Doing GBM DEBUG"
2221
bench = "bench/debug"
2322
date = '-'.join([str(x) for x in list(time.localtime())][0:3])
24-
for f in fs['train']:
25-
overallWallStart = time.time()
26-
pre = ""
27-
if debug: pre = 'DEBUG'
28-
gbmbenchcsv = 'benchmarks/'+build+'/'+date+'/'+pre+'gbmbench.csv'
29-
if not os.path.exists(gbmbenchcsv):
30-
output = open(gbmbenchcsv,'w')
31-
output.write(','.join(csv_header)+'\n')
32-
else:
33-
output = open(gbmbenchcsv,'a')
34-
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
35-
dialect='excel', extrasaction='ignore',delimiter=',')
36-
try:
37-
java_heap_GB = h2o.nodes[0].java_heap_GB
38-
importFolderPath = bench + "/" + folderPath
39-
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']):
40-
csvPathname = importFolderPath + "/" + f + '.csv'
41-
else:
42-
csvPathname = importFolderPath + "/" + f + "/*linked*"
43-
hex_key = f + '.hex'
44-
hK = folderPath + "Header.csv"
45-
headerPathname = importFolderPath + "/" + hK
46-
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
47-
headerKey = h2i.find_key(hK)
48-
trainParseWallStart = time.time()
49-
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
50-
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
51-
path = csvPathname,
52-
schema = 'local',
53-
hex_key = hex_key,
54-
header = 1,
55-
header_from_file = headerKey,
56-
separator = 44,
57-
timeoutSecs = 7200,
58-
retryDelaySecs = 5,
59-
pollTimeoutSecs = 7200,
60-
noPoll = True,
61-
doSummary = False
62-
)
63-
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
64-
parseWallTime = time.time() - trainParseWallStart
65-
print "Parsing training file took ", parseWallTime ," seconds."
66-
#h2o.beta_features = True
67-
inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200)
68-
inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
69-
h2o.beta_features = True
70-
nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
71-
row.update( {'h2o_build' : build,
72-
'nMachines' : nMachines,
73-
'nJVMs' : len(h2o.nodes),
74-
'Xmx/JVM' : java_heap_GB,
75-
'dataset' : f,
76-
'nTrainRows' : inspect_train['num_rows'],
77-
'nTestRows' : inspect_test['num_rows'],
78-
'nCols' : inspect_train['num_cols'],
79-
'trainParseWallTime' : parseWallTime,
80-
'classification' : classification,
81-
})
82-
83-
params = {'destination_key' : 'GBM('+f+')',
84-
'response' : response,
85-
'ignored_cols_by_name' : ignored_cols,
86-
'classification' : classification,
87-
'validation' : testFilehex,
88-
'ntrees' : ntrees,
89-
'max_depth' : depth,
90-
'min_rows' : minrows,
91-
'nbins' : nbins,
92-
'learn_rate' : learnRate,
93-
}
23+
overallWallStart = time.time()
24+
pre = ""
25+
if debug: pre = 'DEBUG'
26+
gbmbenchcsv = 'benchmarks/'+build+'/'+date+'/'+pre+'gbmbench.csv'
27+
if not os.path.exists(gbmbenchcsv):
28+
output = open(gbmbenchcsv,'w')
29+
output.write(','.join(csv_header)+'\n')
30+
else:
31+
output = open(gbmbenchcsv,'a')
32+
csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
33+
dialect='excel', extrasaction='ignore',delimiter=',')
34+
try:
35+
java_heap_GB = h2o.nodes[0].java_heap_GB
36+
importFolderPath = bench + "/" + folderPath
37+
if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']):
38+
csvPathname = importFolderPath + "/" + f + '.csv'
39+
else:
40+
csvPathname = importFolderPath + "/" + f + "/*linked*"
41+
hex_key = f + '.hex'
42+
hK = folderPath + "Header.csv"
43+
headerPathname = importFolderPath + "/" + hK
44+
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
45+
headerKey = h2i.find_key(hK)
46+
trainParseWallStart = time.time()
47+
if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter
48+
parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets',
49+
path = csvPathname,
50+
schema = 'local',
51+
hex_key = hex_key,
52+
header = 1,
53+
header_from_file = headerKey,
54+
separator = 44,
55+
timeoutSecs = 7200,
56+
retryDelaySecs = 5,
57+
pollTimeoutSecs = 7200,
58+
noPoll = True,
59+
doSummary = False
60+
)
61+
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
62+
parseWallTime = time.time() - trainParseWallStart
63+
print "Parsing training file took ", parseWallTime ," seconds."
64+
#h2o.beta_features = True
65+
inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200)
66+
inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
67+
h2o.beta_features = True
68+
nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts)
69+
row.update( {'h2o_build' : build,
70+
'nMachines' : nMachines,
71+
'nJVMs' : len(h2o.nodes),
72+
'Xmx/JVM' : java_heap_GB,
73+
'dataset' : f,
74+
'nTrainRows' : inspect_train['num_rows'],
75+
'nTestRows' : inspect_test['num_rows'],
76+
'nCols' : inspect_train['num_cols'],
77+
'trainParseWallTime' : parseWallTime,
78+
'classification' : classification,
79+
})
80+
81+
params = {'destination_key' : 'GBM('+f+')',
82+
'response' : response,
83+
'ignored_cols_by_name' : ignored_cols,
84+
'classification' : classification,
85+
'validation' : testFilehex,
86+
'ntrees' : ntrees,
87+
'max_depth' : depth,
88+
'min_rows' : minrows,
89+
'nbins' : nbins,
90+
'learn_rate' : learnRate,
91+
}
9492

95-
parseResult = {'destination_key' : hex_key}
96-
kwargs = params.copy()
97-
gbmStart = time.time()
98-
#TODO(spencer): Uses jobs to poll for gbm completion
99-
gbm = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
100-
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5)
101-
gbmTime = time.time() - gbmStart
102-
row.update( {'gbmBuildTime' : gbmTime,
103-
})
104-
gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
105-
if classification:
106-
cm = gbmTrainView['gbm_model']['cm']
107-
err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
108-
else:
109-
err = gbmTrainView['gbm_model']['errs'][-1]
110-
row.update({'Error' : err})
111-
csvWrt.writerow(row)
112-
finally:
113-
output.close()
93+
parseResult = {'destination_key' : hex_key}
94+
kwargs = params.copy()
95+
gbmStart = time.time()
96+
#TODO(spencer): Uses jobs to poll for gbm completion
97+
gbm = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
98+
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5)
99+
gbmTime = time.time() - gbmStart
100+
row.update( {'gbmBuildTime' : gbmTime,
101+
})
102+
gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')')
103+
if classification:
104+
cm = gbmTrainView['gbm_model']['cm']
105+
err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
106+
else:
107+
err = gbmTrainView['gbm_model']['errs'][-1]
108+
row.update({'Error' : err})
109+
csvWrt.writerow(row)
110+
finally:
111+
output.close()
114112

115113
if __name__ == '__main__':
114+
dat = sys.argv.pop(-1)
116115
debug = sys.argv.pop(-1)
117116
build = sys.argv.pop(-1)
118117
h2o.parse_our_args()
119118
h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)
119+
fp = 'Airlines' if 'Air' in dat else 'AllBedrooms'
120120
bench = "bench"
121121
h2o.beta_features = True
122122
if debug:
123-
bench = "bench/debug"
124-
#AIRLINES
125-
airlinesTestParseStart = time.time()
126-
hK = "AirlinesHeader.csv"
127-
headerPathname = bench+"/Airlines" + "/" + hK
128-
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
129-
headerKey = h2i.find_key(hK)
130-
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/Airlines/AirlinesTest.csv', schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44, noPoll=True,doSummary=False)
131-
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
132-
elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
133-
row = {'testParseWallTime' : elapsedAirlinesTestParse}
134-
response = 'IsDepDelayed'
135-
ignored = None
136-
doGBM(files['Airlines'], folderPath='Airlines',
137-
ignored_cols = ignored,
138-
classification = 1,
139-
testFilehex = 'atest.hex',
140-
ntrees = 100,
141-
depth = 5,
142-
minrows = 10,
143-
nbins = 100,
144-
learnRate = 0.01,
145-
response = response,
146-
row = row
147-
)
123+
bench = "bench/debug"
124+
125+
if dat == 'Air1x' : fs = files['Airlines']['train'][0]
126+
if dat == 'Air10x' : fs = files['Airlines']['train'][1]
127+
if dat == 'Air100x' : fs = files['Airlines']['train'][2]
128+
if dat == 'AllB1x' : fs = files['AllBedrooms']['train'][0]
129+
if dat == 'AllB10x' : fs = files['AllBedrooms']['train'][1]
130+
if dat == 'AllB100x' : fs = files['AllBedrooms']['train'][2]
131+
132+
if fp == "Airlines":
133+
#AIRLINES
134+
airlinesTestParseStart = time.time()
135+
hK = "AirlinesHeader.csv"
136+
headerPathname = bench+"/Airlines" + "/" + hK
137+
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
138+
headerKey = h2i.find_key(hK)
139+
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/Airlines/AirlinesTest.csv', schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44, noPoll=True,doSummary=False)
140+
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
141+
elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
142+
row = {'testParseWallTime' : elapsedAirlinesTestParse}
143+
response = 'IsDepDelayed'
144+
ignored = None
145+
doGBM(fs, fp,
146+
ignored_cols = ignored,
147+
classification = 1,
148+
testFilehex = 'atest.hex',
149+
ntrees = 100,
150+
depth = 5,
151+
minrows = 10,
152+
nbins = 100,
153+
learnRate = 0.01,
154+
response = response,
155+
row = row
156+
)
148157

158+
if fp == "AllBedrooms":
159+
#ALLBEDROOMS
160+
allBedroomsTestParseStart = time.time()
161+
hK = "AllBedroomsHeader.csv"
162+
headerPathname = bench+"/AllBedrooms" + "/" + hK
163+
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
164+
headerKey = h2i.find_key(hK)
165+
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/AllBedrooms/AllBedroomsTest.csv', schema='local', hex_key="allBTest.hex", header=1, header_from_file=headerKey, separator=44,noPoll=True,doSummary=False)
166+
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
167+
elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart
168+
row = {'testParseWallTime' : elapsedAllBedroomsTestParse}
169+
response = 'medrent'
170+
ignored = None
171+
doGBM(fs, fp,
172+
ignored_cols = ignored,
173+
classification = 0,
174+
testFilehex = "allBTest.hex",
175+
ntrees = 100,
176+
depth = 5,
177+
minrows = 10,
178+
nbins = 100,
179+
learnRate = 0.01,
180+
response = response,
181+
row = row
182+
)
183+
149184
#COVTYPE
150185
#covTypeTestParseStart = time.time()
151186
#hK = "CovTypeHeader.csv"
@@ -170,57 +205,6 @@ def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, dep
170205
# response = response,
171206
# row = row
172207
# )
173-
174-
#ALLBEDROOMS
175-
allBedroomsTestParseStart = time.time()
176-
hK = "AllBedroomsHeader.csv"
177-
headerPathname = bench+"/AllBedrooms" + "/" + hK
178-
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
179-
headerKey = h2i.find_key(hK)
180-
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/AllBedrooms/AllBedroomsTest.csv', schema='local', hex_key="allBTest.hex", header=1, header_from_file=headerKey, separator=44,noPoll=True,doSummary=False)
181-
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
182-
elapsedAllBedroomsTestParse = time.time() - allBedroomsTestParseStart
183-
row = {'testParseWallTime' : elapsedAllBedroomsTestParse}
184-
response = 'medrent'
185-
ignored = None
186-
doGBM(files['AllBedrooms'], folderPath='AllBedrooms',
187-
ignored_cols = ignored,
188-
classification = 0,
189-
testFilehex = "allBTest.hex",
190-
ntrees = 100,
191-
depth = 5,
192-
minrows = 10,
193-
nbins = 100,
194-
learnRate = 0.01,
195-
response = response,
196-
row = row
197-
)
198-
##################Do Airlines100 here
199-
if debug:
200-
bench = "bench/debug"
201-
#AIRLINES
202-
airlinesTestParseStart = time.time()
203-
hK = "AirlinesHeader.csv"
204-
headerPathname = bench+"/Airlines" + "/" + hK
205-
h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
206-
headerKey = h2i.find_key(hK)
207-
testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/Airlines/AirlinesTest.csv', schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44, noPoll=True,doSummary=False)
208-
h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
209-
elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
210-
row = {'testParseWallTime' : elapsedAirlinesTestParse}
211-
response = 'IsDepDelayed'
212-
ignored = None
213-
doGBM(files['Airlines100'], folderPath='Airlines',
214-
ignored_cols = ignored,
215-
classification = 1,
216-
testFilehex = 'atest.hex',
217-
ntrees = 100,
218-
depth = 5,
219-
minrows = 10,
220-
nbins = 100,
221-
learnRate = 0.01,
222-
response = response,
223-
row = row
224-
)
208+
225209

226210
h2o.tear_down_cloud()

0 commit comments

Comments
 (0)