forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:0xdata/h2o
- Loading branch information
Showing
22 changed files
with
1,542 additions
and
92 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import unittest | ||
import random, sys, time, os | ||
sys.path.extend(['.','..','py']) | ||
import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_kmeans | ||
|
||
class Basic(unittest.TestCase):
    """KMeans test on the libsvm covtype dataset, clustering on a subset of columns."""

    def tearDown(self):
        """Run h2o.check_sandbox_for_errors() after every test."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Record the random seed and build the cloud used by the tests.

        Calls h2o.build_cloud(3, java_heap_GB=4) when
        h2o.decide_if_localhost() is true, otherwise
        h2o_hosts.build_cloud_with_hosts().
        """
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(3, java_heap_GB=4)
        else:
            # uses import Hdfs for s3n instead of import folder
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        """Tear the cloud down once all tests in the class have run."""
        # To inspect things before teardown, sleep here first,
        # e.g. time.sleep(1500).
        h2o.tear_down_cloud()

    def test_KMeans_covtype_cols_fvec(self):
        # Comment rather than docstring: a docstring would change the test
        # description that unittest reports.
        #
        # Parse covtype.binary.svm, inspect it, then run KMeans three times
        # with a fixed seed while ignoring every column from index 11 on.
        h2o.beta_features = True

        # (csvFilename, hex_key, timeoutSecs, resultMult)
        # The original intent was a per-dataset timeout (it can be 10 secs
        # for covtype 20x because of col key creation, so probably 10x that
        # for covtype200). The last two fields are currently unused: parse
        # and KMeans use the fixed timeouts below.
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
        ]
        importFolderPath = "libsvm"
        parseTimeoutSecs = 2000
        kmeansTimeoutSecs = 600

        for (csvFilename, hex_key, _unusedTimeout, _unusedMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            parseResult = h2i.import_parse(
                bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=parseTimeoutSecs)
            destinationKey = parseResult['destination_key']
            print("Parse result['destination_key']: %s" % (destinationKey,))

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, destinationKey, timeoutSecs=360)
            print("Inspect: %s took %s seconds"
                  % (destinationKey, time.time() - start))
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numCols = inspect['numCols']

            # KMEANS******************************************
            kwargs = {
                'k': 3,
                'initialization': 'Furthest',
                'ignored_cols': range(11, numCols),
                'max_iter': 10,
                # 'normalize': 0,
                # reuse the same seed, to get deterministic results
                # (otherwise sometimes fails)
                'seed': 265211114317615310,
            }
            # fails if 'destination_key': parseResult['destination_key'] is
            # put in kwargs, i.e. source = dest

            for _ in range(3):
                start = time.time()
                kmeans = h2o_cmd.runKMeans(
                    parseResult=parseResult, timeoutSecs=kmeansTimeoutSecs,
                    **kwargs)
                elapsed = time.time() - start
                print("kmeans end on  %s took %s seconds. %d pct. of timeout"
                      % (csvPathname, elapsed,
                         (elapsed / kmeansTimeoutSecs) * 100))
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                # NOTE(review): the scraped diff lost its indentation; this
                # check is assumed to run once per KMeans run (inside the
                # loop). Confirm against the original file.
                h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
|
||
|
||
|
||
# Script entry point: run h2o.unit_main() when this file is executed directly.
if __name__ == "__main__":
    h2o.unit_main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import unittest, random, sys, time, json | ||
sys.path.extend(['.','..','py']) | ||
import h2o, h2o_cmd, h2o_hosts, h2o_kmeans, h2o_import as h2i, h2o_util | ||
|
||
def define_create_frame_params(SEED):
    """Return the candidate values for each create_frame parameter.

    The result maps a parameter name to the list of values a trial may
    choose from. A fresh dict is built on every call.

    SEED is accepted but not used by this function.
    """
    paramDict = {
        'rows': [1, 100, 1000],
        # Number of data columns (in addition to the first response column)
        'cols': [1, 10, 100],
        'seed': [None, 1234],
        'randomize': [None, 0, 1],
        # Constant value (for randomize=false)
        'value': [None, 0, 1234567890, 1e6, -1e6],
        # -range to range
        'real_range': [None, 0, 1234567890, 1e6, -1e6],
        # Fraction of categorical columns (for randomize=true)
        'categorical_fraction': [None, 0.1, 1.0],
        # Factor levels for categorical variables
        'factors': [None, 0, 1],
        # Fraction of integer columns (for randomize=true)
        'integer_fraction': [None, 0.1, 1.0],
        # -range to range
        'integer_range': [None, 0, 1, 1234567890],
        'missing_fraction': [None, 0.1, 1.0],
        # Number of factor levels of the first column
        # (1=real, 2=binomial, N=multinomial)
        'response_factors': [None, 0, 1, 2, 10],
    }
    return paramDict
|
||
|
||
def define_KMeans_params(SEED):
    """Return the candidate values for each KMeans parameter.

    The result maps a parameter name to the list of values a trial may
    choose from. SEED is offered as one of the choices for 'seed'.
    """
    paramDict = {
        # seems too slow with 12 clusters if all cols
        'k': [2, 5],
        'initialization': ['None', 'PlusPlus', 'Furthest'],
        'ignored_cols': [None, "0", "3", "0,1,2,3,4"],
        'seed': [None, 12345678, SEED],
        'normalize': [None, 0, 1],
        'max_iter': [10, 20, 50],
        # 'destination_key' is not among the random choices (the entry was
        # commented out in the original).
    }
    return paramDict
|
||
class Basic(unittest.TestCase):
    """KMeans on frames produced by create_frame with randomly picked parameters."""

    def tearDown(self):
        """Run h2o.check_sandbox_for_errors() after every test."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Record the random seed and build the cloud used by the tests.

        Calls h2o.build_cloud(3, java_heap_GB=4) when
        h2o.decide_if_localhost() is true, otherwise
        h2o_hosts.build_cloud_with_hosts().
        """
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(3, java_heap_GB=4)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        """Tear the cloud down once all tests in the class have run."""
        h2o.tear_down_cloud()

    def test_KMeans_create_frame_fvec(self):
        # Comment rather than docstring: a docstring would change the test
        # description that unittest reports.
        #
        # 20 trials; each creates a frame from randomly picked create_frame
        # parameters, inspects it, then runs KMeans on it with randomly
        # picked KMeans parameters.
        timeoutSecs = 300
        for trial in range(20):
            cfParamDict = define_create_frame_params(SEED)
            # default
            params = {
                'rows': 1,
                'cols': 1,
            }
            h2o_util.pickRandParams(cfParamDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)

            # h2o does some strict checking on the combinations of these
            # things: fractions have to add up to <= 1 and only be used if
            # randomize. h2o default randomize=1?
            # NOTE(review): the scraped diff lost its indentation; the three
            # params[...] assignments are assumed to belong to the `if r:`
            # branch, not to the fraction clamp. Confirm against the
            # original file.
            if r:
                i = i or 0
                c = c or 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None
            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0

            kwargs = params.copy()
            hex_key = 'temp_%s.hex' % trial
            h2o.nodes[0].create_frame(
                key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            inspect = h2o_cmd.runInspect(None, hex_key)
            print("\n%s  numRows: %s  numCols: %s" % (
                hex_key,
                "{:,}".format(inspect['numRows']),
                "{:,}".format(inspect['numCols'])))

            kmeansParamDict = define_KMeans_params(SEED)

            # default
            params = {
                'max_iter': 20,
                'k': 1,
                'destination_key': "KM_" + str(trial) + '.hex',
            }
            h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
            kwargs = params.copy()

            start = time.time()
            parseResult = {'destination_key': hex_key}
            kmeans = h2o_cmd.runKMeans(
                parseResult=parseResult, timeoutSecs=timeoutSecs,
                retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            # The original passed "%s" as a separate print item, so it was
            # emitted literally; the values are now interpolated.
            print("kmeans trial %s end on %s took %s seconds. "
                  "%d pct. of timeout"
                  % (trial, hex_key, elapsed, (elapsed / timeoutSecs) * 100))
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)

            print("Trial # %s completed\n" % (trial,))
|
||
# Script entry point: run h2o.unit_main() when this file is executed directly.
if __name__ == "__main__":
    h2o.unit_main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.