Merge branch 'master' of github.com:0xdata/h2o

obisu · Aug 6, 2014 · 9e4a8d1 · 9e4a8d1
2 parents 862bb28 + 7888375
commit 9e4a8d1
Show file tree

Hide file tree

Showing 22 changed files with 1,542 additions and 92 deletions.
diff --git a/R/tests/testdir_algos/glm/runit_GLM_perfectSeparation_balanced.R b/R/tests/testdir_algos/glm/runit_GLM_perfectSeparation_balanced.R
@@ -12,20 +12,9 @@ source('../../findNSourceUtils.R')
 
 test <- function(conn) {
 
-    print("Generate balanced dataset by column in R")
-        y.a = sample(0:0, 200, replace=T)
-        y.b = sample(1:1, 200, replace=T)
-        x1.a = sample(-1203:-1, 200, replace=T)
-        x1.b = sample(1:1, 200, replace=T)
-        x2.a = sample(0:0, 200, replace=T)
-        x2.b = sample(1:1203, 200, replace=T)
-        data.a = cbind(y.a, x1.a, x2.a)
-        data.b = cbind(y.b, x1.b, x2.b)
-        data.balanced = rbind(data.a, data.b)
-        colnames(data.balanced) <- c("y", "x1", "x2")
-
-    print("Read data into H20.")
-        data.b.hex <- as.h2o(conn, as.data.frame(data.balanced), "data.b.hex")
+    print("Read in synthetic balanced dataset")
+        data.b.hex <- h2o.uploadFile(conn, locate("smalldata/synthetic_perfect_separation/balanced.csv"), key="data.b.hex")
+
     print("Fit model on dataset.")
         model.balanced <- h2o.glm(x=c("x1", "x2"), y="y", data.b.hex, family="binomial", lambda_search=TRUE, use_all_factor_levels=1, alpha=0.5, nfolds=0, higher_accuracy=TRUE, lambda=0)
     print("Check line search invoked even with higher_accuracy off")

diff --git a/R/tests/testdir_algos/glm/runit_GLM_perfectSeparation_unbalanced.R b/R/tests/testdir_algos/glm/runit_GLM_perfectSeparation_unbalanced.R
@@ -12,15 +12,9 @@ source('../../findNSourceUtils.R')
 
 test <- function(conn) {
 
-    print("Generate unbalanced dataset by column in R")
-        y = sample(0:0, 10000, replace=T)
-        x1 = sample(-1:-10, 10000, replace=T)
-        x2 = sample(6:10, 10000, replace=T)
-        data = cbind(y, x1, x2)
-        data.unbalanced = rbind(data, c(1, 30, 7))
-
-    print("Read data into H20.")
-        data.u.hex <- as.h2o(conn, as.data.frame(data.unbalanced), "data.u.hex")
+    print("Read in synthetic unbalanced dataset")
+        data.u.hex <- h2o.uploadFile(conn, locate("smalldata/synthetic_perfect_separation/unbalanced.csv"), key="data.u.hex")
+
     print("Fit model on dataset.")
         model.unbalanced <- h2o.glm(x=c("x1", "x2"), y="y", data.u.hex, family="binomial", lambda_search=TRUE, use_all_factor_levels=1, alpha=0.5, nfolds=0, higher_accuracy=TRUE, lambda=0)
     print("Check line search invoked even with higher_accuracy off")

diff --git a/R/tests/testdir_munging/binop/runit_binop2_starCol.R b/R/tests/testdir_munging/binop/runit_binop2_starCol.R
@@ -16,7 +16,7 @@ doSelect<-
 function() {
     d <- select()
     dd <- d[[1]]$ATTRS
-    if(any(dd$TYPES != "enum")) return(d)
+   if(any(dd$TYPES != "enum")) return(d)
     Log.info("No numeric columns found in data, trying a different selection")
     doSelect()
 }
@@ -38,52 +38,19 @@ test.slice.star <- function(conn) {
   if(any(dd$TYPES == "enum")) anyEnum <- TRUE
 
   Log.info("Try adding scalar to a numeric column: 5 * hex[,col]")
-  #col <- sample(colnames[colTypes != "enum"], 1)
-  #col <- ifelse(is.na(suppressWarnings(as.numeric(col))), col, as.numeric(col) + 1)
-  #col <- ifelse(is.na(suppressWarnings(as.numeric(col))), col, paste("C", col, sep = "", collapse = ""))
-  df <- head(hex)
-  col <- sample(colnames(df[!sapply(df, is.factor)]), 1)
-  if (!(grepl("\\.", col))) {
-    col <- gsub("\\.", " ", sample(colnames(df[!sapply(df, is.factor)]), 1))
-  }
 
-  print(which(col == colnames(df)))
-
-  print(colnames(hex))
-  print(col)
-
-  print(col %in% colnames(hex))
-  print(col %in% colnames(df))
-
-  if (!(col %in% colnames(hex))) {
-    col <- which(col == colnames(df))
-  }
+  col <- sample(ncol(hex), 1)
 
   Log.info(paste("Using column: ", col))
 
   sliced <- hex[,col]
   Log.info("Placing key \"sliced.hex\" into User Store")
   sliced <- h2o.assign(sliced, "sliced.hex")
-  print(h2o.ls(conn))
 
   Log.info("*ing 5 to sliced.hex")
   slicedStarFive <- sliced * 5
   slicedStarFive <- h2o.assign(slicedStarFive, "slicedStarFive.hex")
 
-  Log.info("Orignal sliced: ")
-  df_head <- as.data.frame(sliced)
-  df_head <- data.frame(apply(df_head, 1:2, toDouble))
-  print(head(df_head))
-
-  Log.info("Sliced * 5: ")
-  df_slicedStarFive <- as.data.frame(slicedStarFive)
-  df_slicedStarFive <- data.frame(apply(df_slicedStarFive, 1:2, toDouble))
-  df_sliced <- as.data.frame(sliced)
-  df_sliced <- data.frame(apply(df_sliced, 1:2, toDouble))
-  print(head(df_slicedStarFive))
-
-  expect_that(df_slicedStarFive, equals(5 * df_sliced  ))
-
   Log.info("Checking left and right: ")
   slicedStarFive <- sliced * 5
   fiveStarSliced <- 5 * sliced
@@ -94,14 +61,6 @@ test.slice.star <- function(conn) {
   Log.info("5 * sliced: ")
   print(head(fiveStarSliced))
 
-  df_slicedStarFive <- as.data.frame(slicedStarFive)
-  df_slicedStarFive <- data.frame(apply(df_slicedStarFive, 1:2, toDouble))
-  df_sliced <- as.data.frame(fiveStarSliced)
-  df_fiveStarSliced <- data.frame(apply(df_sliced, 1:2, toDouble))
-
-  expect_that(df_slicedStarFive, equals(df_fiveStarSliced))
-
-
   Log.info("Checking the variation of H2OParsedData * H2OParsedData")
 
   hexStarHex <- fiveStarSliced * slicedStarFive
@@ -110,10 +69,6 @@ test.slice.star <- function(conn) {
   print(head(hexStarHex))
 
   Log.info("as.data.frame(fiveStarSliced) * as.data.frame(fiveStarSliced)")
-
-
-  print(head(df_fiveStarSliced*df_fiveStarSliced))
-  expect_that(as.data.frame(hexStarHex), equals(df_fiveStarSliced*df_fiveStarSliced))
 
   testEnd()
 }

diff --git a/py/testdir_multi_jvm/test_KMeansGrid_params_rand2_fvec.py b/py/testdir_multi_jvm/test_KMeansGrid_params_rand2_fvec.py
@@ -76,7 +76,7 @@ def test_KMeansGrid_params_rand2_fvec(self):
                 elapsed = time.time() - start
                 print "FIX! how do we get results..need redirect_url"
                 print "Have to inspect different models? (grid)"
-                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
+                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                     "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                 # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
 

diff --git a/py/testdir_multi_jvm/test_KMeans_covtype_cols_fvec.py b/py/testdir_multi_jvm/test_KMeans_covtype_cols_fvec.py
@@ -0,0 +1,90 @@
+import unittest
+import random, sys, time, os
+sys.path.extend(['.','..','py'])
+import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_kmeans
+
+class Basic(unittest.TestCase):
+    def tearDown(self):
+        h2o.check_sandbox_for_errors()
+
+    @classmethod
+    def setUpClass(cls):
+        global SEED, localhost
+        SEED = h2o.setup_random_seed()
+        localhost = h2o.decide_if_localhost()
+        if (localhost):
+            h2o.build_cloud(3,java_heap_GB=4)
+        else:
+            h2o_hosts.build_cloud_with_hosts() # uses import Hdfs for s3n instead of import folder
+
+    @classmethod
+    def tearDownClass(cls):
+        # wait while I inspect things
+        # time.sleep(1500)
+        h2o.tear_down_cloud()
+
+    def test_KMeans_covtype_cols_fvec(self):
+        h2o.beta_features = True
+        # just do the import folder once
+        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
+        # so probably 10x that for covtype200
+        csvFilenameList = [
+            ("covtype.binary.svm", "cC", 30, 1),
+            # normal csv
+        ]
+
+        ### csvFilenameList = random.sample(csvFilenameAll,1)
+        # h2b.browseTheCloud()
+        lenNodes = len(h2o.nodes)
+
+        firstDone = False
+        importFolderPath = "libsvm"
+        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
+            # have to import each time, because h2o deletes source after parse
+            csvPathname = importFolderPath + "/" + csvFilename
+
+            # PARSE******************************************
+            # creates csvFilename.hex from file in importFolder dir 
+            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
+                hex_key=hex_key, timeoutSecs=2000)
+            print "Parse result['destination_key']:", parseResult['destination_key']
+
+            # INSPECT******************************************
+            start = time.time()
+            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
+            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
+            h2o_cmd.infoFromInspect(inspect, csvFilename)
+            numRows = inspect['numRows']
+            numCols = inspect['numCols']
+
+            # KMEANS******************************************
+            for trial in range(1):
+                kwargs = {
+                    'k': 3, 
+                    'initialization': 'Furthest',
+                    'ignored_cols': range(11, numCols),
+                    'max_iter': 10,
+                    # 'normalize': 0,
+                    # reuse the same seed, to get deterministic results (otherwise sometimes fails)
+                    'seed': 265211114317615310,
+                }
+
+                # fails if I put this in kwargs..i.e. source = dest
+                # 'destination_key': parseResult['destination_key'],
+
+                for trial2 in range(3):
+                    timeoutSecs = 600
+                    start = time.time()
+                    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
+                    elapsed = time.time() - start
+                    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
+                        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
+                    # this does an inspect of the model and prints the clusters
+                    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
+
+                    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
+
+
+
+if __name__ == '__main__':
+    h2o.unit_main()
diff --git a/py/testdir_multi_jvm/test_KMeans_create_frame_fvec.py b/py/testdir_multi_jvm/test_KMeans_create_frame_fvec.py
@@ -0,0 +1,123 @@
+import unittest, random, sys, time, json
+sys.path.extend(['.','..','py'])
+import h2o, h2o_cmd, h2o_hosts, h2o_kmeans, h2o_import as h2i, h2o_util
+
+def define_create_frame_params(SEED):
+    paramDict = {
+        'rows': [1, 100, 1000],
+        'cols': [1, 10, 100], # Number of data columns (in addition to the first response column)
+        'seed': [None, 1234],
+        'randomize': [None, 0, 1],
+        'value': [None, 0, 1234567890, 1e6, -1e6], # Constant value (for randomize=false)
+        'real_range': [None, 0, 1234567890, 1e6, -1e6], # -range to range
+        'categorical_fraction': [None, 0.1, 1.0], # Fraction of categorical columns (for randomize=true)
+        'factors': [None, 0, 1], # Factor levels for categorical variables
+        'integer_fraction': [None, 0.1, 1.0], # Fraction of integer columns (for randomize=true)
+        'integer_range': [None, 0, 1, 1234567890], # -range to range
+        'missing_fraction': [None, 0.1, 1.0],
+        'response_factors': [None, 0, 1, 2, 10], # Number of factor levels of the first column (1=real, 2=binomial, N=multinomial)
+    }
+    return paramDict
+
+
+def define_KMeans_params(SEED):
+    paramDict = {
+        'k': [2, 5], # seems too slow with 12 clusters if all cols
+        'initialization': ['None', 'PlusPlus', 'Furthest'],
+        'ignored_cols': [None, "0", "3", "0,1,2,3,4"],
+        'seed': [None, 12345678, SEED],
+        'normalize': [None, 0, 1],
+        'max_iter': [10,20,50],
+        # 'destination_key:': "junk",
+
+        }
+    return paramDict
+
+class Basic(unittest.TestCase):
+    def tearDown(self):
+        h2o.check_sandbox_for_errors()
+
+    @classmethod
+    def setUpClass(cls):
+        global SEED, localhost
+        SEED = h2o.setup_random_seed()
+        localhost = h2o.decide_if_localhost()
+        if (localhost):
+            h2o.build_cloud(3,java_heap_GB=4)
+        else:
+            h2o_hosts.build_cloud_with_hosts()
+
+    @classmethod
+    def tearDownClass(cls):
+        h2o.tear_down_cloud()
+
+    def test_KMeans_create_frame_fvec(self):
+        for trial in range(20):
+
+            cfParamDict = define_create_frame_params(SEED)
+            # default
+            params = {
+                'rows': 1,
+                'cols': 1
+            }
+            h2o_util.pickRandParams(cfParamDict, params)
+            i = params.get('integer_fraction', None)
+            c = params.get('categorical_fraction', None)
+            r = params.get('randomize', None)
+            v = params.get('value', None)
+
+            # h2o does some strict checking on the combinations of these things
+            # fractions have to add up to <= 1 and only be used if randomize
+            # h2o default randomize=1?
+            if r:
+                if not i:
+                    i = 0
+                if not c:
+                    c = 0
+                if (i and c) and (i + c) >= 1.0:
+                    c = 1.0 - i
+                params['integer_fraction'] = i
+                params['categorical_fraction'] = c
+                params['value'] = None
+
+            else:
+                params['randomize'] = 0
+                params['integer_fraction'] = 0
+                params['categorical_fraction'] = 0
+
+
+            kwargs = params.copy()
+            timeoutSecs = 300
+            hex_key = 'temp_%s.hex' % trial
+            cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
+            inspect = h2o_cmd.runInspect(None, hex_key)
+            print "\n%s" % hex_key, \
+                "    numRows:", "{:,}".format(inspect['numRows']), \
+                "    numCols:", "{:,}".format(inspect['numCols'])
+
+            kmeansParamDict = define_KMeans_params(SEED)
+
+            # default
+            params = {
+                'max_iter': 20, 
+                'k': 1, 
+                'destination_key': "KM_" + str(trial) + '.hex'
+            }
+            h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
+            kwargs = params.copy()
+
+            start = time.time()
+            parseResult = {'destination_key': hex_key }
+            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
+                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
+            elapsed = time.time() - start
+            print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \
+                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
+            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)
+
+            ### print h2o.dump_json(kmeans)
+
+            print "Trial #", trial, "completed\n"
+
+if __name__ == '__main__':
+    h2o.unit_main()
diff --git a/py/testdir_single_jvm/test_GBM_basic.py b/py/testdir_single_jvm/test_GBM_basic.py
@@ -47,7 +47,7 @@ def test_GBM_basic_benign(self):
             'min_rows': 1,
             'response': 'FNDX',
             'classification': 1 if DO_CLASSIFICATION else 0,
-            }
+        }
 
         kwargs = params.copy()
         timeoutSecs = 1800
@@ -94,7 +94,7 @@ def test_GBM_basic_prostate(self):
             'min_rows': 1,
             'response': 'CAPSULE',
             'classification': 1 if DO_CLASSIFICATION else 0,
-            }
+        }
 
         kwargs = params.copy()
         timeoutSecs = 1800

diff --git a/py/testdir_single_jvm/test_KMeans_covtype_fvec.py b/py/testdir_single_jvm/test_KMeans_covtype_fvec.py
@@ -39,12 +39,13 @@ def test_KMeans_covtype_fvec(self):
 
             for trial in range(3):
                 kwargs = {
-                    'source': u'covtype.hex', 
-                    'destination_key': 'covtype.data_2.hex', 
-                    'initialization': 'Furthest', 
-                    # 'max_iter': 20, 
-                    'max_iter': 50, 
-                    'k': 2,
+                    'k': 3,
+                    'initialization': 'Furthest',
+                    'ignored_cols': range(11, inspect['numCols']),
+                    'max_iter': 10,
+                    # 'normalize': 0,
+                    # reuse the same seed, to get deterministic results
+                    'seed': 265211114317615310
                 }
 
                 start = time.time()