# test_GLM2_many_cols_enum.py
# (from a fork of h2oai/h2o-2)
import unittest, random, sys, time, getpass
sys.path.extend(['.','..','py'])
# FIX! add cases with shuffled data!
import h2o, h2o_cmd, h2o_hosts, h2o_glm
import h2o_browse as h2b, h2o_import as h2i, h2o_exec as h2e
def write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList):
    """Write a synthetic CSV of rowCount rows x (colCount predictors + 1 response).

    Each predictor cell is int(r.triangular(1, 5, 2.5)), i.e. an integer in
    1..4 biased toward the middle.  If translateList is not None, each value v
    is replaced by translateList[v-1], turning every predictor column into an
    enum (letters).  The last column is a binary response: 1 when the row's
    pre-translation mean exceeds 2.3, else 0.

    A private random.Random(SEED) is used, so output is reproducible per SEED.
    """
    # do we need more than one random generator?
    r1 = random.Random(SEED)
    # "w" (not "w+"): the file is only written, never read back here, and the
    # context manager guarantees it is closed even on error.
    with open(csvPathname, "w") as dsf:
        for _row in range(rowCount):
            rowData = []
            for _col in range(colCount):
                ri1 = int(r1.triangular(1, 5, 2.5))
                rowData.append(ri1)
            # Total is computed BEFORE translation; the response depends on
            # the numeric values, not the enum letters.
            rowTotal = sum(rowData)
            if translateList is not None:
                # numbers are 1..4, mapping into translateList
                # (index renamed from 'i' -- original shadowed the row index)
                for k, iNum in enumerate(rowData):
                    rowData[k] = translateList[iNum - 1]
            rowAvg = (rowTotal + 0.0) / colCount
            # threshold 2.3 (was 2.25) keeps the two classes reasonably balanced
            if rowAvg > 2.3:
                result = 1
            else:
                result = 0
            # list comprehension instead of map(): map() returns an iterator
            # on Python 3, which has no .append
            rowDataStr = [str(x) for x in rowData]
            rowDataStr.append(str(result))
            dsf.write(",".join(rowDataStr) + "\n")
class Basic(unittest.TestCase):
    """Parse+GLM2 benchmark test over increasingly wide synthetic enum datasets.

    For each (rowCount, colCount) case: generate an enum CSV, parse it into
    the cloud, then run a binomial GLM2, logging both timings through
    h2o.cloudPerfH2O so runs can be compared across builds.
    """

    def tearDown(self):
        # After every test, scan each node's sandbox logs for errors or
        # stack traces and fail if any are found.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Build the cloud once for the whole class.  These are module-level
        # globals so the per-file seed helper and any reruns can see them.
        global SEED, localhost, tryHeap
        tryHeap = 14
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            # Single local JVM with a large heap for the wide-column cases.
            h2o.build_cloud(1, enable_benchmark_log=True, java_heap_GB=tryHeap)
        else:
            h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=True)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_GLM2_many_cols_enum(self):
        # Use the newer ("beta") REST API endpoints.
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # write_syn_dataset maps predictor values 1..4 into this list,
        # making every predictor column an enum.
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        # Each tuple: (rowCount, colCount, hex_key, timeoutSecs).
        if getpass.getuser() == 'kevin': # longer run
            tryList = [
                (10000, 100, 'cA', 100),
                (10000, 300, 'cB', 300),
                (10000, 500, 'cC', 700),
                (10000, 700, 'cD', 3600),
                (10000, 900, 'cE', 3600),
                (10000, 1000, 'cF', 3600),
                (10000, 1300, 'cG', 3600),
                (10000, 1700, 'cH', 3600),
                (10000, 2000, 'cI', 3600),
                (10000, 2500, 'cJ', 3600),
                (10000, 3000, 'cK', 3600),
            ]
        else:
            tryList = [
                (10000, 100, 'cA', 100),
                (10000, 300, 'cC', 300),
            ]
        ### h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            # Fresh seed per generated file; seed is embedded in the filename
            # so a failing dataset can be regenerated.
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, translateList)
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, hex_key=hex_key, schema='put', timeoutSecs=30)
            elapsed = time.time() - start
            print "Parse result['destination_key']:", parseResult['destination_key']
            algo = "Parse"
            # Benchmark log line: cloud size, heap, algo, file, elapsed secs.
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)
            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            # Response is the last column (0-based index == predictor count).
            y = colCount
            # just limit to 2 iterations..assume it scales with more iterations
            # java.lang.IllegalArgumentException: Too many predictors!
            # GLM can only handle 5000 predictors, got 5100, try to run with strong_rules enabled.
            kwargs = {
                'response': y,
                'max_iter': 2,
                'family': 'binomial',
                'lambda': 1e-4,
                'alpha': 0.6,
                'n_folds': 1,
                'beta_epsilon': 1e-4,
                'strong_rules_enabled': 1,
            }
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            # Check sandbox logs right after the GLM, before validating results.
            h2o.check_sandbox_for_errors()
            print "glm end on ", csvPathname, 'took', elapsed, 'seconds', \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            algo = "GLM "
            l = '{:d} jvms, {:d}GB heap, {:s} {:s} {:6.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, algo, csvFilename, elapsed)
            print l
            h2o.cloudPerfH2O.message(l)
if __name__ == '__main__':
    # NOTE(review): h2o.unit_main appears to be the project's wrapper around
    # unittest.main (handles h2o-specific CLI args) -- confirm against h2o.py.
    h2o.unit_main()