test_parse_rand_schmoo.py (forked from h2oai/h2o-2)
import os, json, unittest, time, shutil, sys
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd
import h2o_browse as h2b
import random
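# Schmoo test: build a ~1M-row synthetic prostate-style CSV, then repeatedly
# append a random number of rows and re-parse the growing file into h2o,
# timing each parse and checking the sandbox for errors after every trial.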
def write_syn_dataset(csvPathname, rowCount, headerData, rowData):
    # write the header line, then rowCount copies of the same row
    with open(csvPathname, "w") as dsf:
        dsf.write(headerData + "\n")
        for i in range(rowCount):
            dsf.write(rowData + "\n")
# append num more copies of rowData to an existing dataset file
def append_syn_dataset(csvPathname, rowData, num):
    with open(csvPathname, "a") as dsf:
        for i in range(num):
            dsf.write(rowData + "\n")
def rand_rowData():
    # UPDATE: maybe because of byte-buffer boundary issues, single-byte
    # data is best? If we put all 0s or 1s, then I guess it will parse as bits?
    rowData = str(random.uniform(0, 7))
    for i in range(8):
        rowData = rowData + "," + str(random.uniform(-1e59, 1e59))
    return rowData
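# Note: rand_rowData() emits 9 comma-separated values (1 + 8), matching the
# 9-column header "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON" used below.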
class parse_rand_schmoo(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(2, java_heap_GB=10, use_flatfile=True)
        else:
            import h2o_hosts
            h2o_hosts.build_cloud_with_hosts()
        h2b.browseTheCloud()

    @classmethod
    def tearDownClass(cls):
        if not h2o.browse_disable:
            # time.sleep(500000)
            pass
        h2o.tear_down_cloud(h2o.nodes)
    def test_sort_of_prostate_with_row_schmoo(self):
        SEED = random.randint(0, sys.maxint)
        # to force a rerun of a failing case, set SEED to the printed value here
        # SEED =
        random.seed(SEED)
        print "\nUsing random seed:", SEED

        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)
        print "This is the same format/data file used by test_same_parse, but the non-gzipped version"

        print "\nSchmoo the # of rows"
        for trial in range(100):
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num
            start = time.time()
            # make sure all key names are unique when we re-put and re-parse (h2o caching issues)
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"
            # On EC2, once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key=key, key2=key2,
                timeoutSecs=150, pollTimeoutSecs=150)
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=key2)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            h2o.check_sandbox_for_errors()
if __name__ == '__main__':
    h2o.unit_main()
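# Run directly, e.g. `python test_parse_rand_schmoo.py`. h2o.unit_main() is
# assumed to wrap unittest.main() (as in the h2o-2 test harness), so standard
# unittest flags such as -v should pass through.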