forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_KMeans_allstate_s3n_thru_hdfs.py
57 lines (47 loc) · 2.12 KB
/
test_KMeans_allstate_s3n_thru_hdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import unittest, time, sys, random
sys.path.extend(['.','..','../..','py'])
import h2o, h2o_cmd, h2o_glm, h2o_kmeans, h2o_browse as h2b, h2o_import as h2i
class Basic(unittest.TestCase):
    """KMeans test on the Allstate training set, imported with schema='s3n'.

    The h2o cloud is started once for the class and torn down afterwards;
    h2o's sandbox error check runs after every test method.
    """

    def tearDown(self):
        """Run h2o's sandbox error check after each test."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Start h2o for this test class via h2o.init(1)."""
        # Assumes we're running at 0xdata, with access to its HDFS namenode.
        # NOTE(review): the argument 1 is presumably the node count -- confirm
        # against h2o.init.
        h2o.init(1)

    @classmethod
    def tearDownClass(cls):
        """Shut the h2o cloud down once all tests in the class have run."""
        h2o.tear_down_cloud()

    def test_KMeans_allstate_s3n_thru_hdfs(self):
        """Parse allstate/train_set.csv over s3n and run KMeans on it, 3 times.

        Each trial parses the file into its own hex key, runs KMeans with
        k=12 and 'Furthest' initialization, passes the result to
        h2o_kmeans.simpleCheckKMeans, and prints the inspect JSON for the
        KMeans destination key. Parse and KMeans wall-clock times are
        reported as a percentage of the shared timeout.
        """
        bucket = 'home-0xdiag-datasets'
        importFolderPath = 'allstate'
        csvFilename = "train_set.csv"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 600
        trialMax = 3

        # KMeans parameters are identical for every trial, so build them once.
        # They are only ever passed as **kwargs, so callees cannot mutate them.
        kwargs = {
            'cols': None,
            'initialization': 'Furthest',
            'k': 12,
        }

        # Each message is built as ONE string so that print(...) emits exactly
        # the same text on Python 2 (where this file runs) as the old
        # comma-separated print statements did, while also parsing on Python 3.
        for trial in range(trialMax):
            trialStart = time.time()
            # A distinct key per trial keeps earlier parse results from being reused.
            hex_key = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            parseResult = h2i.import_parse(
                bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
            elapsed = time.time() - start
            print("parse end on  %s took %s seconds %d pct. of timeout"
                  % (csvPathname, elapsed, elapsed * 100 / timeoutSecs))
            print("parse result: %s" % (parseResult['destination_key'],))

            start = time.time()
            kmeans = h2o_cmd.runKMeans(
                parseResult=parseResult,
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=120,
                **kwargs)
            elapsed = time.time() - start
            # Same percentage formula as for the parse step above.
            print("kmeans end on  %s took %s seconds. %d pct. of timeout"
                  % (csvFilename, elapsed, elapsed * 100 / timeoutSecs))
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            # Named inspectResult so the local does not shadow the stdlib
            # `inspect` module name.
            inspectResult = h2o_cmd.runInspect(None, key=kmeans['destination_key'])
            print(h2o.dump_json(inspectResult))

            # The original statement ended in a dangling ", \" continuation,
            # which suppressed the newline and glued the next statement onto
            # this one. Each trial summary now ends its own line.
            print("Trial # %s completed in %s seconds."
                  % (trial, time.time() - trialStart))
# When run as a script (rather than imported), delegate to h2o.unit_main().
if __name__ == "__main__":
    h2o.unit_main()