Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
cliffclick committed Aug 6, 2014
2 parents 862bb28 + 7888375 commit 9e4a8d1
Show file tree
Hide file tree
Showing 22 changed files with 1,542 additions and 92 deletions.
17 changes: 3 additions & 14 deletions R/tests/testdir_algos/glm/runit_GLM_perfectSeparation_balanced.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,9 @@ source('../../findNSourceUtils.R')

test <- function(conn) {

print("Generate balanced dataset by column in R")
y.a = sample(0:0, 200, replace=T)
y.b = sample(1:1, 200, replace=T)
x1.a = sample(-1203:-1, 200, replace=T)
x1.b = sample(1:1, 200, replace=T)
x2.a = sample(0:0, 200, replace=T)
x2.b = sample(1:1203, 200, replace=T)
data.a = cbind(y.a, x1.a, x2.a)
data.b = cbind(y.b, x1.b, x2.b)
data.balanced = rbind(data.a, data.b)
colnames(data.balanced) <- c("y", "x1", "x2")

print("Read data into H20.")
data.b.hex <- as.h2o(conn, as.data.frame(data.balanced), "data.b.hex")
print("Read in synthetic balanced dataset")
data.b.hex <- h2o.uploadFile(conn, locate("smalldata/synthetic_perfect_separation/balanced.csv"), key="data.b.hex")

print("Fit model on dataset.")
model.balanced <- h2o.glm(x=c("x1", "x2"), y="y", data.b.hex, family="binomial", lambda_search=TRUE, use_all_factor_levels=1, alpha=0.5, nfolds=0, higher_accuracy=TRUE, lambda=0)
print("Check line search invoked even with higher_accuracy off")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,9 @@ source('../../findNSourceUtils.R')

test <- function(conn) {

print("Generate unbalanced dataset by column in R")
y = sample(0:0, 10000, replace=T)
x1 = sample(-1:-10, 10000, replace=T)
x2 = sample(6:10, 10000, replace=T)
data = cbind(y, x1, x2)
data.unbalanced = rbind(data, c(1, 30, 7))

print("Read data into H20.")
data.u.hex <- as.h2o(conn, as.data.frame(data.unbalanced), "data.u.hex")
print("Read in synthetic unbalanced dataset")
data.u.hex <- h2o.uploadFile(conn, locate("smalldata/synthetic_perfect_separation/unbalanced.csv"), key="data.u.hex")

print("Fit model on dataset.")
model.unbalanced <- h2o.glm(x=c("x1", "x2"), y="y", data.u.hex, family="binomial", lambda_search=TRUE, use_all_factor_levels=1, alpha=0.5, nfolds=0, higher_accuracy=TRUE, lambda=0)
print("Check line search invoked even with higher_accuracy off")
Expand Down
49 changes: 2 additions & 47 deletions R/tests/testdir_munging/binop/runit_binop2_starCol.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ doSelect<-
function() {
d <- select()
dd <- d[[1]]$ATTRS
if(any(dd$TYPES != "enum")) return(d)
if(any(dd$TYPES != "enum")) return(d)
Log.info("No numeric columns found in data, trying a different selection")
doSelect()
}
Expand All @@ -38,52 +38,19 @@ test.slice.star <- function(conn) {
if(any(dd$TYPES == "enum")) anyEnum <- TRUE

Log.info("Try adding scalar to a numeric column: 5 * hex[,col]")
#col <- sample(colnames[colTypes != "enum"], 1)
#col <- ifelse(is.na(suppressWarnings(as.numeric(col))), col, as.numeric(col) + 1)
#col <- ifelse(is.na(suppressWarnings(as.numeric(col))), col, paste("C", col, sep = "", collapse = ""))
df <- head(hex)
col <- sample(colnames(df[!sapply(df, is.factor)]), 1)
if (!(grepl("\\.", col))) {
col <- gsub("\\.", " ", sample(colnames(df[!sapply(df, is.factor)]), 1))
}

print(which(col == colnames(df)))

print(colnames(hex))
print(col)

print(col %in% colnames(hex))
print(col %in% colnames(df))

if (!(col %in% colnames(hex))) {
col <- which(col == colnames(df))
}
col <- sample(ncol(hex), 1)

Log.info(paste("Using column: ", col))

sliced <- hex[,col]
Log.info("Placing key \"sliced.hex\" into User Store")
sliced <- h2o.assign(sliced, "sliced.hex")
print(h2o.ls(conn))

Log.info("*ing 5 to sliced.hex")
slicedStarFive <- sliced * 5
slicedStarFive <- h2o.assign(slicedStarFive, "slicedStarFive.hex")

Log.info("Orignal sliced: ")
df_head <- as.data.frame(sliced)
df_head <- data.frame(apply(df_head, 1:2, toDouble))
print(head(df_head))

Log.info("Sliced * 5: ")
df_slicedStarFive <- as.data.frame(slicedStarFive)
df_slicedStarFive <- data.frame(apply(df_slicedStarFive, 1:2, toDouble))
df_sliced <- as.data.frame(sliced)
df_sliced <- data.frame(apply(df_sliced, 1:2, toDouble))
print(head(df_slicedStarFive))

expect_that(df_slicedStarFive, equals(5 * df_sliced ))

Log.info("Checking left and right: ")
slicedStarFive <- sliced * 5
fiveStarSliced <- 5 * sliced
Expand All @@ -94,14 +61,6 @@ test.slice.star <- function(conn) {
Log.info("5 * sliced: ")
print(head(fiveStarSliced))

df_slicedStarFive <- as.data.frame(slicedStarFive)
df_slicedStarFive <- data.frame(apply(df_slicedStarFive, 1:2, toDouble))
df_sliced <- as.data.frame(fiveStarSliced)
df_fiveStarSliced <- data.frame(apply(df_sliced, 1:2, toDouble))

expect_that(df_slicedStarFive, equals(df_fiveStarSliced))


Log.info("Checking the variation of H2OParsedData * H2OParsedData")

hexStarHex <- fiveStarSliced * slicedStarFive
Expand All @@ -110,10 +69,6 @@ test.slice.star <- function(conn) {
print(head(hexStarHex))

Log.info("as.data.frame(fiveStarSliced) * as.data.frame(fiveStarSliced)")


print(head(df_fiveStarSliced*df_fiveStarSliced))
expect_that(as.data.frame(hexStarHex), equals(df_fiveStarSliced*df_fiveStarSliced))

testEnd()
}
Expand Down
2 changes: 1 addition & 1 deletion py/testdir_multi_jvm/test_KMeansGrid_params_rand2_fvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_KMeansGrid_params_rand2_fvec(self):
elapsed = time.time() - start
print "FIX! how do we get results..need redirect_url"
print "Have to inspect different models? (grid)"
print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
"%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
# h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

Expand Down
90 changes: 90 additions & 0 deletions py/testdir_multi_jvm/test_KMeans_covtype_cols_fvec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import unittest
import random, sys, time, os
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_kmeans

# KMeans test on the covtype.binary.svm dataset: the file is parsed and
# inspected, then KMeans is run with every column from index 11 onward ignored.
# NOTE(review): leading indentation is absent in this copy of the file; confirm
# statement nesting against the repository version before relying on it.
class Basic(unittest.TestCase):
def tearDown(self):
# check the h2o sandbox for errors after each test method
h2o.check_sandbox_for_errors()

@classmethod
def setUpClass(cls):
# SEED and localhost are published as module globals
global SEED, localhost
SEED = h2o.setup_random_seed()
localhost = h2o.decide_if_localhost()
# local run: h2o.build_cloud(3, java_heap_GB=4); otherwise build the cloud on the configured hosts
if (localhost):
h2o.build_cloud(3,java_heap_GB=4)
else:
h2o_hosts.build_cloud_with_hosts() # uses import Hdfs for s3n instead of import folder

@classmethod
def tearDownClass(cls):
# wait while I inspect things
# time.sleep(1500)
h2o.tear_down_cloud()

def test_KMeans_covtype_cols_fvec(self):
h2o.beta_features = True
# just do the import folder once
# make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
# so probably 10x that for covtype200
# each entry is (csvFilename, hex_key, timeoutSecs, resultMult), unpacked by the loop below
csvFilenameList = [
("covtype.binary.svm", "cC", 30, 1),
# normal csv
]

### csvFilenameList = random.sample(csvFilenameAll,1)
# h2b.browseTheCloud()
# NOTE(review): lenNodes and firstDone are assigned but never read in this test
lenNodes = len(h2o.nodes)

firstDone = False
importFolderPath = "libsvm"
for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
# have to import each time, because h2o deletes source after parse
csvPathname = importFolderPath + "/" + csvFilename

# PARSE******************************************
# creates csvFilename.hex from file in importFolder dir
# note: the parse uses a fixed 2000 second timeout, not the per-dataset timeoutSecs
parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
hex_key=hex_key, timeoutSecs=2000)
print "Parse result['destination_key']:", parseResult['destination_key']

# INSPECT******************************************
start = time.time()
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
h2o_cmd.infoFromInspect(inspect, csvFilename)
# numCols bounds ignored_cols below; numRows is not read again in this test
numRows = inspect['numRows']
numCols = inspect['numCols']

# KMEANS******************************************
for trial in range(1):
kwargs = {
'k': 3,
'initialization': 'Furthest',
# ignore every column from index 11 up to the last one
'ignored_cols': range(11, numCols),
'max_iter': 10,
# 'normalize': 0,
# reuse the same seed, to get deterministic results (otherwise sometimes fails)
'seed': 265211114317615310,
}

# fails if I put this in kwargs..i.e. source = dest
# 'destination_key': parseResult['destination_key'],

# the same KMeans request (same kwargs, same seed) is issued three times
for trial2 in range(3):
# overrides the per-dataset timeoutSecs unpacked from csvFilenameList
timeoutSecs = 600
start = time.time()
kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
elapsed = time.time() - start
print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
"%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
# this does an inspect of the model and prints the clusters
h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

# NOTE(review): centers and tupleResultList are not used after this call
(centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)



if __name__ == '__main__':
h2o.unit_main()
123 changes: 123 additions & 0 deletions py/testdir_multi_jvm/test_KMeans_create_frame_fvec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import unittest, random, sys, time, json
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_hosts, h2o_kmeans, h2o_import as h2i, h2o_util

def define_create_frame_params(SEED):
    """Return the candidate values for each create_frame parameter.

    The result maps a parameter name to the list of values that may be
    chosen for it; None appears in a list where leaving the parameter
    unset is one of the choices. SEED is accepted but not used here.
    """
    return {
        # frame dimensions
        'rows': [1, 100, 1000],
        # number of data columns (in addition to the first response column)
        'cols': [1, 10, 100],
        'seed': [None, 1234],
        'randomize': [None, 0, 1],
        # constant value (for randomize=false)
        'value': [None, 0, 1234567890, 1e6, -1e6],
        # -range to range
        'real_range': [None, 0, 1234567890, 1e6, -1e6],
        # fraction of categorical columns (for randomize=true)
        'categorical_fraction': [None, 0.1, 1.0],
        # factor levels for categorical variables
        'factors': [None, 0, 1],
        # fraction of integer columns (for randomize=true)
        'integer_fraction': [None, 0.1, 1.0],
        # -range to range
        'integer_range': [None, 0, 1, 1234567890],
        'missing_fraction': [None, 0.1, 1.0],
        # number of factor levels of the first column
        # (1=real, 2=binomial, N=multinomial)
        'response_factors': [None, 0, 1, 2, 10],
    }


def define_KMeans_params(SEED):
    """Return the candidate values for each KMeans parameter.

    The result maps a parameter name to the list of values that may be
    chosen for it. SEED is offered as one of the 'seed' candidates.
    """
    return {
        # seems too slow with 12 clusters if all cols
        'k': [2, 5],
        'initialization': ['None', 'PlusPlus', 'Furthest'],
        'ignored_cols': [None, "0", "3", "0,1,2,3,4"],
        'seed': [None, 12345678, SEED],
        'normalize': [None, 0, 1],
        'max_iter': [10, 20, 50],
        # 'destination_key': "junk",
    }

class Basic(unittest.TestCase):
def tearDown(self):
h2o.check_sandbox_for_errors()

@classmethod
def setUpClass(cls):
global SEED, localhost
SEED = h2o.setup_random_seed()
localhost = h2o.decide_if_localhost()
if (localhost):
h2o.build_cloud(3,java_heap_GB=4)
else:
h2o_hosts.build_cloud_with_hosts()

@classmethod
def tearDownClass(cls):
h2o.tear_down_cloud()

def test_KMeans_create_frame_fvec(self):
for trial in range(20):

cfParamDict = define_create_frame_params(SEED)
# default
params = {
'rows': 1,
'cols': 1
}
h2o_util.pickRandParams(cfParamDict, params)
i = params.get('integer_fraction', None)
c = params.get('categorical_fraction', None)
r = params.get('randomize', None)
v = params.get('value', None)

# h2o does some strict checking on the combinations of these things
# fractions have to add up to <= 1 and only be used if randomize
# h2o default randomize=1?
if r:
if not i:
i = 0
if not c:
c = 0
if (i and c) and (i + c) >= 1.0:
c = 1.0 - i
params['integer_fraction'] = i
params['categorical_fraction'] = c
params['value'] = None

else:
params['randomize'] = 0
params['integer_fraction'] = 0
params['categorical_fraction'] = 0


kwargs = params.copy()
timeoutSecs = 300
hex_key = 'temp_%s.hex' % trial
cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
inspect = h2o_cmd.runInspect(None, hex_key)
print "\n%s" % hex_key, \
" numRows:", "{:,}".format(inspect['numRows']), \
" numCols:", "{:,}".format(inspect['numCols'])

kmeansParamDict = define_KMeans_params(SEED)

# default
params = {
'max_iter': 20,
'k': 1,
'destination_key': "KM_" + str(trial) + '.hex'
}
h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
kwargs = params.copy()

start = time.time()
parseResult = {'destination_key': hex_key }
kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
elapsed = time.time() - start
print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \
"%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

### print h2o.dump_json(kmeans)

print "Trial #", trial, "completed\n"

if __name__ == '__main__':
h2o.unit_main()
4 changes: 2 additions & 2 deletions py/testdir_single_jvm/test_GBM_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_GBM_basic_benign(self):
'min_rows': 1,
'response': 'FNDX',
'classification': 1 if DO_CLASSIFICATION else 0,
}
}

kwargs = params.copy()
timeoutSecs = 1800
Expand Down Expand Up @@ -94,7 +94,7 @@ def test_GBM_basic_prostate(self):
'min_rows': 1,
'response': 'CAPSULE',
'classification': 1 if DO_CLASSIFICATION else 0,
}
}

kwargs = params.copy()
timeoutSecs = 1800
Expand Down
13 changes: 7 additions & 6 deletions py/testdir_single_jvm/test_KMeans_covtype_fvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@ def test_KMeans_covtype_fvec(self):

for trial in range(3):
kwargs = {
'source': u'covtype.hex',
'destination_key': 'covtype.data_2.hex',
'initialization': 'Furthest',
# 'max_iter': 20,
'max_iter': 50,
'k': 2,
'k': 3,
'initialization': 'Furthest',
'ignored_cols': range(11, inspect['numCols']),
'max_iter': 10,
# 'normalize': 0,
# reuse the same seed, to get deterministic results
'seed': 265211114317615310
}

start = time.time()
Expand Down
Loading

0 comments on commit 9e4a8d1

Please sign in to comment.