forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_parse_rand_schmoo.py
80 lines (66 loc) · 2.93 KB
/
test_parse_rand_schmoo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import unittest, time, sys, random
sys.path.extend(['.','..','py'])
import h2o, h2o_hosts, h2o_cmd, h2o_browse as h2b, h2o_import as h2i
def write_syn_dataset(csvPathname, rowCount, headerData, rowData):
    # Create the synthetic csv: one header line, then rowCount identical data rows.
    # csvPathname: destination file path (truncated/created).
    # rowCount:    number of copies of rowData to write.
    # headerData:  header line (without trailing newline).
    # rowData:     data line (without trailing newline), repeated rowCount times.
    #
    # 'with' guarantees the handle is closed even if a write raises (the original
    # leaked the handle on error); plain "w" replaces the unnecessary "w+" since
    # nothing is read back.
    with open(csvPathname, "w") as dsf:
        dsf.write(headerData + "\n")
        for _ in range(rowCount):
            dsf.write(rowData + "\n")
# append!
def append_syn_dataset(csvPathname, rowData, num):
    """Append num newline-terminated copies of rowData to an existing csv."""
    line = rowData + "\n"
    with open(csvPathname, "a") as out:
        out.writelines(line for _ in range(num))
def rand_rowData():
    """Build one random csv row: a [0,7) float followed by 8 huge-range floats.

    UPDATE: maybe because of byte buffer boundary issues, single byte
    data is best? if we put all 0s or 1, then I guess it will be bits?
    """
    # Same RNG call order as before: one uniform(0,7), then eight wide-range draws.
    fields = [str(random.uniform(0, 7))]
    fields.extend(str(random.uniform(-1e59, 1e59)) for _ in range(8))
    return ",".join(fields)
class test_parse_rand_schmoo(unittest.TestCase):
    """Stress test: repeatedly grow a synthetic csv and re-parse it into an
    h2o cloud under fresh key names, watching for slowdowns/hangs/errors
    as the file grows. Relies on the project-local h2o test framework."""

    def tearDown(self):
        # After every test, scan the h2o sandbox logs for errors the cloud emitted.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Seed the module RNG reproducibly and stand up a cloud once for the class:
        # a local 2-node cloud when running on localhost, else remote hosts.
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(2,java_heap_GB=10,use_flatfile=True)
        else:
            h2o_hosts.build_cloud_with_hosts()
        h2b.browseTheCloud()

    @classmethod
    def tearDownClass(cls):
        # Shut down every node started in setUpClass.
        h2o.tear_down_cloud(h2o.nodes)

    def test_sort_of_prostate_with_row_schmoo(self):
        # Start from a 1M-row prostate-shaped file, then grow and re-parse it.
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_prostate.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        rowData = rand_rowData()
        totalRows = 1000000
        write_syn_dataset(csvPathname, totalRows, headerData, rowData)
        print "This is the same format/data file used by test_same_parse, but the non-gzed version"
        print "\nSchmoo the # of rows"
        # used to fail around 50 iterations..python memory problem
        for trial in range (40):
            # Append a random-sized batch of a fresh random row, then re-parse.
            rowData = rand_rowData()
            num = random.randint(4096, 10096)
            append_syn_dataset(csvPathname, rowData, num)
            totalRows += num
            start = time.time()
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            # On EC2 once we get to 30 trials or so, do we see polling hang? GC or spill of heap or ??
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
                timeoutSecs=150, pollTimeoutSecs=150)
            # Report per-trial timing so a gradual parse slowdown is visible in the log.
            print "trial #", trial, "totalRows:", totalRows, "num:", num, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'
            ### h2o_cmd.runInspect(key=hex_key)
            ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            # Check the sandbox each trial so a failure is attributed to the right iteration.
            h2o.check_sandbox_for_errors()
if __name__ == '__main__':
    # Delegate to the h2o framework's unittest entry point.
    h2o.unit_main()