Merge branch 'master' of https://github.com/0xdata/h2o

eric-code · May 1, 2014 · c6d87de · c6d87de
2 parents 9a33543 + 90c48cf
commit c6d87de
Show file tree

Hide file tree

Showing 17 changed files with 99 additions and 72 deletions.
diff --git a/R/h2o-package/NAMESPACE b/R/h2o-package/NAMESPACE
@@ -6,7 +6,7 @@ export('h2o.shutdown', 'h2o.init', 'h2o.glm', 'h2o.glm.FV', 'h2o.kmeans', 'h2o.r
        'h2o.importFolder', 'h2o.uploadFile', 'h2o.parseRaw', 'h2o.importFile.VA', 'h2o.importFolder.VA', 
        'h2o.importHDFS.VA', 'h2o.importURL.VA', 'h2o.uploadFile.VA', 'h2o.parseRaw.VA', 'h2o.ls', 'h2o.rm',
        'summary.H2OParsedData', 'summary.H2OParsedDataVA', 'screeplot.H2OPCAModel', 'h2o.cut', '.', 'h2o..',
-       'cbind.H2OParsedData', '[[.H2OParsedData', '[[<-.H2OParsedData', 'unique.H2OParsedData')
+       'cbind.H2OParsedData', 'unique.H2OParsedData')
 exportMethods(
   "colnames",
   "show",

diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R
@@ -1201,7 +1201,7 @@ h2o.SpeeDRF <- function(x, y, data, classification=TRUE, validation,
     #}
 
     class_names = tail(res$'_domains', 1)[[1]]
-    result$confusion = .build_cm(res$'cm', class_names)
+    result$confusion = .build_cm(tail(res$cms, 1)[[1]]$'_arr', class_names)
   }
 
   return(result)

diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R
@@ -590,26 +590,17 @@ setMethod("$<-", "H2OParsedData", function(x, name, value) {
   return(new("H2OParsedData", h2o=x@h2o, key=x@key))
 })
 
-`[[.H2OParsedData` <- function(x, ..., exact = TRUE) {
-  if( missing(x) ) stop('must specify x')
-  if( !class(x) == 'H2OParsedData') stop('x is the wrong class')
-
-  cols <- sapply(as.list(...), function(x) x)
-  if( length(cols) == 0 )
-    return(x)
-  if( length(cols) > 1 ) stop('[[]] may only select one column')
-  if( ! cols[1] %in% colnames(x) )
-    return(NULL)
-
-  x[, cols]
-}
+setMethod("[[", "H2OParsedData", function(x, i, exact = TRUE) {
+  if(missing(i)) return(x)
+  if(length(i) > 1) stop("[[]] may only select one column")
+  if(!i %in% colnames(x) ) return(NULL)
+  x[, i]
+})
 
-`[[<-.H2OParsedData` <- function(x, i, j, value) {
-  if( missing(x) ) stop('must specify x')
-  if( !inherits(x, 'H2OParsedData')) stop('x is the wrong class')
-  if( !inherits(value, 'H2OParsedData')) stop('can only append H2O data to H2O data')
-  if( ncol(value) > 1 ) stop('may only set a single column')
-  if( nrow(value) != nrow(x) ) stop(sprintf('replacement has %d row, data has %d', nrow(value), nrow(x)))
+setMethod("[[<-", "H2OParsedData", function(x, i, value) {
+  if( !inherits(value, 'H2OParsedData')) stop('Can only append H2O data to H2O data')
+  if( ncol(value) > 1 ) stop('May only set a single column')
+  if( nrow(value) != nrow(x) ) stop(sprintf('Replacement has %d row, data has %d', nrow(value), nrow(x)))
 
   mm <- match.call()
   col_name <- as.list(i)[[1]]
@@ -623,7 +614,7 @@ setMethod("$<-", "H2OParsedData", function(x, name, value) {
     colnames(x) <- cc
   }
   x
-}
+})
 
 # Note: right now, all things must be H2OParsedData
 cbind.H2OParsedData <- function(..., deparse.level = 1) {

diff --git a/R/h2o-package/man/h2o.SpeeDRF.Rd b/R/h2o-package/man/h2o.SpeeDRF.Rd
@@ -40,7 +40,7 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
   \item{sample.rate}{
   (Optional) Sampling rate for constructing data from which individual trees are grown.
   }
-  \item{oobe}{
+  \item{oobee}{
   (Optional) A logical value indicating whether to calculate the out of bag error estimate.
   }
   \item{nbins}{
@@ -64,6 +64,8 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
 }
 
 \details{
+IMPORTANT: Currently, you must initialize H2O with the flag \code{beta = TRUE} in \code{h2o.init} in order to use this method!
+
 This method runs random forest model building on a single node, as opposed to the multi-node implementation in \code{\link{h2o.randomForest.FV}}.
 }
 
@@ -79,13 +81,12 @@ An object of class \code{\linkS4class{H2OSpeeDRFModel}} with slots key, data, va
 }
 
 \seealso{
-\code{\linkS4Class{H2OSpeeDRFModel}}, \code{\link{h2o.randomForest}}
+\code{\linkS4class{H2OSpeeDRFModel}}, \code{\link{h2o.randomForest}}
 }
 
 \examples{
-# Run an RF model on iris data
 library(h2o)
-localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
+localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE, beta = TRUE)
 irisPath = system.file("extdata", "iris.csv", package = "h2o")
 iris.hex = h2o.importFile(localH2O, path = irisPath, key = "iris.hex")
 h2o.SpeeDRF(x = c(2,3,4), y = 5, data = iris.hex, ntree = 50, depth = 100)

diff --git a/R/h2o-package/man/h2o.gapStatistic.Rd b/R/h2o-package/man/h2o.gapStatistic.Rd
@@ -33,7 +33,7 @@ A list containing the following components:
 }
 
 \references{
-Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. \emph{Journal of the Royal Statistical Society B}, \strong{63}, 411–423.
+Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. \emph{Journal of the Royal Statistical Society B}, \strong{63}, 411-423.
 
 Tibshirani, R., Walther, G. and Hastie, T. (2000). Estimating the number of clusters in a dataset via the Gap statistic. Technical Report. Stanford.
 }
@@ -43,7 +43,7 @@ Tibshirani, R., Walther, G. and Hastie, T. (2000). Estimating the number of clus
 }
 \examples{
 library(h2o)
-localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
+localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE, beta = TRUE)
 irisPath = system.file("extdata", "iris.csv", package = "h2o")
 iris.hex = h2o.importFile(localH2O, path = irisPath)
 h2o.gapStatistic(iris.hex, K.max = 10, B = 100)

diff --git a/py/testdir_multi_jvm/test_NN2_mnist_multi.py b/py/testdir_multi_jvm/test_NN2_mnist_multi.py
@@ -74,9 +74,6 @@ def test_NN2_mnist_multi(self):
         nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
         print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'
 
-        relTol = 0.10 ### 10% relative error is acceptable for Hogwild
-        h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)
-
         ### Now score using the model, and check the validation error
         kwargs = {
             'source' : validation_key,
@@ -90,8 +87,5 @@ def test_NN2_mnist_multi(self):
         nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
         h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)
 
-
-        h2o.beta_features = False
-
 if __name__ == '__main__':
     h2o.unit_main()
diff --git a/py/testdir_multi_jvm/test_many_fp_formats_libsvm.py b/py/testdir_multi_jvm/test_many_fp_formats_libsvm.py
@@ -227,18 +227,25 @@ def test_many_fp_formats_libsvm (self):
                         # we may not see the min/max range of values that was bounded by our gen, but 
                         # we can check that it's a subset of the allowed range
                         if synKey == 'min':
-                            self.assertTrue(syn[synKey] <= cols[synKey],
+                            # can have quoted numbers in json?
+                            self.assertTrue(syn[synKey] <= float(cols[synKey]),
                                 msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
                         elif synKey == 'max':
-                            self.assertTrue(syn[synKey] >= cols[synKey],
+                            # can have quoted numbers in json?
+                            self.assertTrue(syn[synKey] >= float(cols[synKey]),
                                 msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
+                        elif synKey == 'variance':
+                            # can have quoted numbers in json?
+                            self.assertTrue(syn[synKey] == float(cols[synKey]),
+                                msg='col %s %s %s should be == %s' % (k, synKey, cols[synKey], syn[synKey]))
                         elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
                             if cols[synKey] not in syn[synKey]:
                                 # for debug of why it was a bad size
                                 print "cols size/min/max:", cols['size'], cols['min'], cols['max']
                                 print "syn size/min/max:", syn['size'], syn['min'], syn['max']
                                 raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
                         else:
+                            # can have quoted numbers in json?
                             self.assertEqual(syn[synKey], cols[synKey],
                                 msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))
 

diff --git a/py/testdir_multi_jvm/test_many_fp_formats_libsvm_2.py b/py/testdir_multi_jvm/test_many_fp_formats_libsvm_2.py
@@ -175,7 +175,7 @@ def test_many_fp_formats_libsvm_2(self):
 
                     synMean = (v + 0.0)/rowCount
                     # enums don't have mean, but we're not enums
-                    mean = inspect['cols'][k]['mean']
+                    mean = float(inspect['cols'][k]['mean'])
                     # our fp formats in the syn generation sometimes only have two places?
                     self.assertAlmostEqual(mean, synMean, places=0,
                         msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))

diff --git a/py/testdir_multi_jvm/test_parse_bounds_csv.py b/py/testdir_multi_jvm/test_parse_bounds_csv.py
@@ -113,10 +113,10 @@ def test_parse_bounds_csv (self):
                     iColNameToOffset[iName] = iOffset
                     # just touching to make sure they are there
                     num_missing_values = iColDict['num_missing_values']
-                    iMin = iColDict['min']
-                    iMax = iColDict['max']
-                    iMean = iColDict['mean']
-                    iVariance = iColDict['variance']
+                    iMin = float(iColDict['min'])
+                    iMax = float(iColDict['max'])
+                    iMean = float(iColDict['mean'])
+                    iVariance = float(iColDict['variance'])
 
                 # SUMMARY********************************
                 summaryResult = h2o_cmd.runSummary(key=hex_key, max_column_display=colCount, timeoutSecs=timeoutSecs)

diff --git a/py/testdir_multi_jvm/test_parse_bounds_libsvm.py b/py/testdir_multi_jvm/test_parse_bounds_libsvm.py
@@ -149,10 +149,13 @@ def test_parse_bounds_libsvm (self):
                     # definitely not enums
                     zeros = columns['zeros']
                     na = columns['na']
+                    print h2o.dump_json(columns)
+                    print columns['max'], columns['min'], columns['mean'], columns['sigma']
+                    # these numbers aren't quoted. array of 5 min, 5 max
                     smax = columns['max']
                     smin = columns['min']
-                    mean = columns['mean']
-                    sigma = columns['sigma']
+                    mean = float(columns['mean'])
+                    sigma = float(columns['sigma'])
 
                     # a single 1 in the last col
                     if name == "V" + str(colNumberMax): # h2o puts a "V" prefix

diff --git a/py/testdir_multi_jvm/test_parse_libsvm.py b/py/testdir_multi_jvm/test_parse_libsvm.py
@@ -44,8 +44,8 @@ def test_parse_libsvm(self):
         # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
         # so probably 10x that for covtype200
         csvFilenameList = [
-            ("mnist_train.svm", "cM", 30, 0, 9, False, False),
-            ("covtype.binary.svm", "cC", 30, 1, 2, True, True),
+            ("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
+            ("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
             # multi-label target like 1,2,5 ..not sure what that means
             # ("tmc2007_train.svm",  "cJ", 30, 0, 21.0, False, False),
             # illegal non-ascending cols
@@ -54,13 +54,13 @@ def test_parse_libsvm(self):
             # fails csvDownload
             ("duke.svm",           "cD", 30, -1.000000, 1.000000, False, False),
             ("colon-cancer.svm",   "cA", 30, -1.000000, 1.000000, False, False),
-            ("news20.svm",         "cH", 30, 1, 20, False, False), 
-            ("connect4.svm",       "cB", 30, -1, 1, False, False),
+            ("news20.svm",         "cH", 30, 1, 20.0, False, False), 
+            ("connect4.svm",       "cB", 30, -1, 1.0, False, False),
             # too many features? 150K inspect timeout?
             # ("E2006.train.svm",    "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)
 
-            ("gisette_scale.svm",  "cF", 30, -1, 1, False, False),
-            ("mushrooms.svm",      "cG", 30, 1, 2, False, False),
+            ("gisette_scale.svm",  "cF", 30, -1, 1.0, False, False),
+            ("mushrooms.svm",      "cG", 30, 1, 2.0, False, False),
         ]
 
         ### csvFilenameList = random.sample(csvFilenameAll,1)
@@ -83,14 +83,15 @@ def test_parse_libsvm(self):
             h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
             # look at the min/max for the target col (0) and compare to expected for the dataset
 
-            imin = inspectFirst['cols'][0]['min']
-            imax = inspectFirst['cols'][0]['max']
+            imin = float(inspectFirst['cols'][0]['min'])
+            # print h2o.dump_json(inspectFirst['cols'][0])
+            imax = float(inspectFirst['cols'][0]['max'])
 
             if expectedCol0Min:
                 self.assertEqual(imin, expectedCol0Min,
                     msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
             if expectedCol0Max:
-                self.assertEqual(imax, expectedCol0Max,
+                h2o_util.assertApproxEqual(imax, expectedCol0Max, tol=0.00000001,
                     msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))
 
             print "\nmin/max for col0:", imin, imax
@@ -164,8 +165,8 @@ def test_parse_libsvm(self):
                     # make the check conditional based on the dataset
                     self.assertEqual(row_sizeA, row_sizeB,
                         "row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB))
-                    self.assertEqual(value_size_bytesA, value_size_bytesB,
-                        "value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))
+                    h2o_util.assertApproxEqual(value_size_bytesA, value_size_bytesB, tol=0.00000001,
+                        msg="value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))
 
                 print "missingValuesListA:", missingValuesListA
                 print "missingValuesListB:", missingValuesListB

diff --git a/src/main/java/hex/FrameSplitter.java b/src/main/java/hex/FrameSplitter.java
@@ -107,7 +107,7 @@ private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
       int[] splits = Utils.partitione(nrows, ratios);
       assert splits.length == ratios.length+1 : "Unexpected number of splits";
       for (int j=0; j<splits.length; j++) {
-        assert splits[j] > 0 : "Ups, no rows for " + j + "-th segment!";
+        //assert splits[j] > 0 : "Ups, no rows for " + j + "-th segment!";
         r[j][i+1] = r[j][i] + splits[j]; // previous + current number of rows
       }
     }
@@ -116,12 +116,14 @@ private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
 
   private void onDone(boolean exceptional) {
     dataset.unlock(jobKey);
-    for (Frame s : splits) {
-      if (!exceptional) { // just unlock if everything was ok
-        s.update(jobKey);
-        s.unlock(jobKey);
-      } else { // else delete current half-done results
-        if (s!=null) s.delete(jobKey,0f);
+    if (splits!=null) { // if exception is hit before splits array is allocated
+      for (Frame s : splits) {
+        if (!exceptional) { // just unlock if everything was ok
+          s.update(jobKey);
+          s.unlock(jobKey);
+        } else { // else delete current half-done results
+          if (s!=null) s.delete(jobKey,0f);
+        }
       }
     }
   }

diff --git a/src/main/java/water/api/FrameSplitPage.java b/src/main/java/water/api/FrameSplitPage.java
@@ -42,11 +42,14 @@ public class FrameSplitPage extends Func {
     super.init();
     /* Check input parameters */
     float sum = 0;
+    long nrows = source.numRows();
+    if (nrows <= ratios.length) throw new IllegalArgumentException("Dataset does not have enough row to be split!");
     for (int i=0; i<ratios.length; i++) {
       if (!(ratios[i] > 0 && ratios[i] < 1)) throw new IllegalArgumentException("Split ration has to be in (0,1) interval!");
+      if (ratios[i] * nrows <= 1) throw new IllegalArgumentException("Ratio " + ratios[i] + " produces empty frame since the source frame has only " + nrows + "!");
       sum += ratios[i];
     }
-    if (sum>1) throw new IllegalArgumentException("Sum of split ratios has to be less or equal to 1!");
+    if (!(sum<1f)) throw new IllegalArgumentException("Sum of split ratios has to be less than 1!");
   }
 
   // Run the function
@@ -56,7 +59,7 @@ public class FrameSplitPage extends Func {
 
     Frame[] splits = fs.getResult();
 
-    split_keys = new Key[splits.length];
+    split_keys = new Key [splits.length];
     split_rows = new long[splits.length];
     float rsum = Utils.sum(ratios);
     split_ratios = Arrays.copyOf(ratios, splits.length);
@@ -72,30 +75,34 @@ public class FrameSplitPage extends Func {
 
   @Override public boolean toHTML(StringBuilder sb) {
     int nsplits = split_keys.length;
-    String [] headers = new String[nsplits+1];
+    String [] headers = new String[nsplits+2];
     headers[0] = "";
     for(int i=0; i<nsplits; i++) headers[i+1] = "Split #"+i;
+    headers[nsplits+1] = "Total";
     DocGen.HTML.arrayHead(sb, headers);
     // Key table row
     sb.append("<tr><td>").append(DocGen.HTML.bold("Keys")).append("</td>");
     for (int i=0; i<nsplits; i++) {
       Key k = split_keys[i];
-      sb.append("<td>").append(Inspect2.link(k.toString(), k)).append("</td>");
+      sb.append("<td>").append(Inspect2.link(k)).append("</td>");
     }
+    sb.append("<td>").append(Inspect2.link(source._key)).append("</td>");
     sb.append("</tr>");
     // Number of rows row
     sb.append("<tr><td>").append(DocGen.HTML.bold("Rows")).append("</td>");
     for (int i=0; i<nsplits; i++) {
       long r = split_rows[i];
       sb.append("<td>").append(String.format("%,d", r)).append("</td>");
     }
+    sb.append("<td>").append(String.format("%,d", Utils.sum(split_rows))).append("</td>");
     sb.append("</tr>");
     // Split ratios
     sb.append("<tr><td>").append(DocGen.HTML.bold("Ratios")).append("</td>");
     for (int i=0; i<nsplits; i++) {
       float r = 100*split_ratios[i];
       sb.append("<td>").append(String.format("%.2f %%", r)).append("</td>");
     }
+    sb.append("<td>").append(String.format("%.2f %%", 100*Utils.sum(split_ratios))).append("</td>");
     sb.append("</tr>");
     DocGen.HTML.arrayTail(sb);
     return true;

diff --git a/src/main/java/water/api/Inspect.java b/src/main/java/water/api/Inspect.java
@@ -255,10 +255,14 @@ public Response serveValueArray(final ValueArray va, int max_column) {
       json.addProperty(SIZE, Math.abs(c._size));
       json.addProperty(BASE, c._base);
       json.addProperty(SCALE, (int) c._scale);
-      json.addProperty(MIN,  (c.isEnum() || Double.isNaN(c._min)) ? "\"NaN\"" : String.valueOf(c._min));
-      json.addProperty(MAX,  (c.isEnum() || Double.isNaN(c._max)) ? "\"NaN\"" : String.valueOf(c._max));
-      json.addProperty(MEAN,  (c.isEnum() || Double.isNaN(c._mean)) ? "\"NaN\"" : String.valueOf(c._mean));
-      json.addProperty(VARIANCE,  (c.isEnum() || Double.isNaN(c._sigma)) ? "\"NaN\"" : String.valueOf(c._sigma));
+      if(c.isEnum() || Double.isNaN(c._min)) json.addProperty(MIN, "\"NaN\"");
+      else json.addProperty(MIN, c._min);
+      if(c.isEnum() || Double.isNaN(c._max)) json.addProperty(MAX, "\"NaN\"");
+      else json.addProperty(MAX, c._max);
+      if(c.isEnum() || Double.isNaN(c._mean)) json.addProperty(MEAN, "\"NaN\"");
+      else json.addProperty(MEAN, c._mean);
+      if(c.isEnum() || Double.isNaN(c._sigma)) json.addProperty(VARIANCE, "\"NaN\"");
+      else json.addProperty(VARIANCE, c._sigma);
       json.addProperty(NUM_MISSING_VALUES, va._numrows - c._n);
       json.addProperty(TYPE, c.isEnum() ? "enum" : (c.isFloat() ? "float" : "int"));
       json.addProperty(ENUM_DOMAIN_SIZE, c.isEnum() ? c._domain.length : 0);

diff --git a/src/main/java/water/api/Inspect2.java b/src/main/java/water/api/Inspect2.java
@@ -303,6 +303,9 @@ private String infoLink(Key k){
     return "<a href='/2/Inspect2.html?src_key=" + k.toString() + "&offset=-1'>info</a>";
   }
 
+  public static String link(Key k) {
+    return link(k.toString(), k.toString());
+  }
   public static String link(String txt,Key k) {
     return link(txt, k.toString());
   }

diff --git a/src/main/java/water/util/Utils.java b/src/main/java/water/util/Utils.java
@@ -1458,11 +1458,14 @@ public static final int[] partitione(int len, float[] ratio) {
     int[] r = new int[ratio.length+1];
     int sum = 0;
     int i = 0;
+    float sr = 0;
     for (i=0; i<ratio.length; i++) {
       r[i] = (int) (ratio[i]*len);
       sum += r[i];
+      sr  += ratio[i];
     }
-    r[i] = len - sum;
+    if (sr<1f) r[i] = len - sum;
+    else r[i-1] += (len-sum);
     return r;
   }