Merge branch 'master' of github.com:0xdata/h2o

rknarukulla · Jan 8, 2014 · a3ab869 · a3ab869
2 parents 9282354 + b2a3ec9
commit a3ab869
Show file tree

Hide file tree

Showing 10 changed files with 188 additions and 78 deletions.
diff --git a/R/h2oRClient-package/R/Classes.R b/R/h2oRClient-package/R/Classes.R
@@ -558,41 +558,41 @@ setMethod("as.data.frame", "H2OParsedData", function(x) {
 
   # Substitute NAs for blank cells rather than skipping.
   df = read.csv(textConnection(ttt), blank.lines.skip = FALSE)
+  return(df)
 })
 
 setMethod("head", "H2OParsedData", function(x, n = 6L, ...) {
-  if(n == 0 || !is.numeric(n)) stop("n must be a non-zero integer")
-  n = round(n)
-  # if(abs(n) > nrow(x)) stop(paste("n must be between 1 and", nrow(x)))
   numRows = nrow(x)
-  if(n < 0 && abs(n) >= numRows) return(data.frame())
-  myView = ifelse(n > 0, min(n, numRows), numRows+n)
-  if(myView > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
-
-  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=0, view=myView)
-  temp = unlist(lapply(res$rows, function(y) { y$row = NULL; y }))
-  if(is.null(temp)) return(temp)
-  x.df = data.frame(matrix(temp, nrow = myView, byrow = TRUE))
-  colnames(x.df) = unlist(lapply(res$cols, function(y) y$name))
-  x.df
+  stopifnot(length(n) == 1L)
+  n <- ifelse(n < 0L, max(numRows + n, 0L), min(n, numRows))
+  if(n == 0) return(data.frame())
+
+  x.slice = as.data.frame(x[seq_len(n),])
+  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
+  x.lev = lapply(res$summaries, function(x) { x$hbrk })
+  for(i in 1:ncol(x)) {
+    if(res$summaries[[i]]$stats$type == 'Enum')
+      x.slice[,i] <- factor(x.slice[,i], levels = res$summaries[[i]]$hbrk)
+  }
+  return(x.slice)
 })
 
 setMethod("tail", "H2OParsedData", function(x, n = 6L, ...) {
-  if(n == 0 || !is.numeric(n)) stop("n must be a non-zero integer")
-  n = round(n)
-  # if(abs(n) > nrow(x)) stop(paste("n must be between 1 and", nrow(x)))
-  numRows = nrow(x)
-  if(n < 0 && abs(n) >= numRows) return(data.frame())
-  myOff = ifelse(n > 0, max(0, numRows-n), abs(n))
-  myView = ifelse(n > 0, min(n, numRows), numRows+n)
-  if(myView > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
-
-  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=myOff, view=myView)
-  temp = unlist(lapply(res$rows, function(y) { y$row = NULL; y }))
-  if(is.null(temp)) return(temp)
-  x.df = data.frame(matrix(temp, nrow = myView, byrow = TRUE))
-  colnames(x.df) = unlist(lapply(res$cols, function(y) y$name))
-  x.df
+  stopifnot(length(n) == 1L)
+  nrx <- nrow(x)
+  n <- ifelse(n < 0L, max(nrx + n, 0L), min(n, nrx))
+  if(n == 0) return(data.frame())
+
+  idx = seq.int(to = nrx, length.out = n)
+  x.slice = as.data.frame(x[idx,])
+  rownames(x.slice) = idx
+  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
+  x.lev = lapply(res$summaries, function(x) { x$hbrk })
+  for(i in 1:ncol(x)) {
+    if(res$summaries[[i]]$stats$type == 'Enum')
+      x.slice[,i] <- factor(x.slice[,i], levels = res$summaries[[i]]$hbrk)
+  }
+  return(x.slice)
 })
 
 setMethod("as.factor", "H2OParsedData", function(x) { h2o.__unop2("factor", x) })
@@ -822,12 +822,50 @@ setMethod("dim", "H2OParsedDataVA", function(x) {
   as.numeric(c(res$num_rows, res$num_cols))
 })
 
+setMethod("length", "H2OParsedData", function(x) { ncol(x) })
+
 setMethod("head", "H2OParsedDataVA", function(x, n = 6L, ...) {
-  head(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...)
+  numRows = nrow(x)
+  stopifnot(length(n) == 1L)
+  n <- ifelse(n < 0L, max(numRows + n, 0L), min(n, numRows))
+  if(n == 0) return(data.frame())
+  if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
+
+  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=0, view=n)
+  temp = lapply(res$rows, function(y) { y$row = NULL; as.data.frame(y) })
+  if(is.null(temp)) return(temp)
+  x.slice = do.call(rbind, temp)
+
+  res2 = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
+  x.lev = lapply(res2$summaries, function(x) { x$hbrk })
+  for(i in 1:ncol(x)) {
+    if(res2$summaries[[i]]$stats$type == 'Enum')
+      x.slice[,i] <- factor(x.slice[,i], levels = res2$summaries[[i]]$hbrk)
+  }
+  return(x.slice)
 })
 
 setMethod("tail", "H2OParsedDataVA", function(x, n = 6L, ...) {
-  tail(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...)
+  stopifnot(length(n) == 1L)
+  nrx <- nrow(x)
+  n <- ifelse(n < 0L, max(nrx + n, 0L), min(n, nrx))
+  if(n == 0) return(data.frame())
+  if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))
+
+  idx = seq.int(to = nrx, length.out = n)
+  res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=idx[1], view=length(idx))
+  temp = lapply(res$rows, function(y) { y$row = NULL; as.data.frame(y) })
+  if(is.null(temp)) return(temp)
+  x.slice = do.call(rbind, temp)
+  rownames(x.slice) = idx
+
+  res2 = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
+  x.lev = lapply(res2$summaries, function(x) { x$hbrk })
+  for(i in 1:ncol(x)) {
+    if(res2$summaries[[i]]$stats$type == 'Enum')
+      x.slice[,i] <- factor(x.slice[,i], levels = res2$summaries[[i]]$hbrk)
+  }
+  return(x.slice)
 })
 
 setMethod("summary", "H2OParsedDataVA", function(object) {

diff --git a/R/h2oRClient-package/demo/h2o.kmeans.R b/R/h2oRClient-package/demo/h2o.kmeans.R
@@ -19,16 +19,16 @@ if("fpc" %in% rownames(installed.packages())) {
   library(fpc)
 
   par(mfrow=c(1,1))
-  plotcluster(prostate.data, prostate.clus$predict)
+  plotcluster(prostate.data, prostate.clus[,1])
   title("K-Means Classification for k = 10")
 }
 
 # if(!"cluster" %in% rownames(installed.packages())) install.packages("cluster")
 if("cluster" %in% rownames(installed.packages())) {
   library(cluster)
-  clusplot(prostate.data, prostate.clus$predict, color = TRUE, shade = TRUE)
+  clusplot(prostate.data, prostate.clus[,1], color = TRUE, shade = TRUE)
 }
-pairs(prostate.data[,c(2,3,7,8)], col=prostate.clus$predict)
+pairs(prostate.data[,c(2,3,7,8)], col=prostate.clus[,1])
 
 # Plot k-means centers
 par(mfrow = c(1,2))

diff --git a/R/tests/testdir_golden/runit_gbm_1_golden.R b/R/tests/testdir_golden/runit_gbm_1_golden.R
@@ -0,0 +1,43 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
+source('../findNSourceUtils.R')
+
+test.gbmMSEgauss.golden <- function(H2Oserver) {
+
+#Import data: 
+Log.info("Importing smtrees data...") 
+smtreesH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/smtrees.csv"), key="smtreesH2O")
+smtreesR<- read.csv(locate("../../smalldata/smtrees.csv"))
+
+#Log.info("Test H2O generation of MSE for GBM")
+fith2o<- h2o.gbm(x=c("girth", "height"), y="vol", n.trees=10, interaction.depth=1, distribution="gaussian", n.minobsinnode=2, shrinkage=.1, data=smtreesH2O)
+
+#Reported MSE from H2O through R
+err<- as.data.frame(fith2o@model$err)
+REPMSE<- err[11,]
+
+#MSE Calculated by hand From H2O predicted values
+pred<- as.data.frame(h2o.predict(fith2o, newdata=smtreesH2O))
+diff<- pred-smtreesR[,4]
+diff<- diff[-1,]
+diffsq<- diff^2
+EXPMSE<- mean(diffsq)
+
+Log.info("Print model MSE... \n")
+Log.info(paste("Length of H2O MSE Vec: ", length(fith2o@model$err),      "\t\t", "Expected Length   : ", 10))
+Log.info(paste("H2O Reported MSE  : ", REPMSE, "\t\t", "R Expected MSE   : ", EXPMSE))
+
+Log.info("Compare model statistics in R to model statistics in H2O")
+expect_equal(length(fith2o@model$err), 10)
+expect_equal(REPMSE, EXPMSE, tolerance = 0.01)
+
+testEnd()
+}
+
+doTest("GBM Test: Golden GBM - MSE for GBM Regression", test.gbmMSEgauss.golden)
+
+
+
+
+
+
+
diff --git a/prj.el b/prj.el
@@ -7,7 +7,7 @@
  '(jde-run-option-debug nil)
  '(jde-run-option-vm-args (quote ("-XX:+PrintGC")))
  '(jde-compile-option-directory "./target/classes")
- '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "hex.gbm.GBMTest")))
+ '(jde-run-option-application-args (quote ("-beta" "-mainClass" "hex.drf.Runner")))
  '(jde-debugger (quote ("JDEbug")))
  '(jde-compile-option-source (quote ("1.6")))
  '(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar")))

diff --git a/smalldata/allstate/claim_prediction_train_set_10000_bool.csv.gz b/smalldata/allstate/claim_prediction_train_set_10000_bool.csv.gz
diff --git a/smalldata/smtrees.csv b/smalldata/smtrees.csv
@@ -0,0 +1,11 @@
+"","girth","height","vol"
+"1",17.9,85,51.5
+"2",14.2,83,21.4
+"3",8.6,70,15.6
+"4",11.7,82,77
+"5",20.6,81,24.2
+"6",8.3,77,42.6
+"7",18,78,24.9
+"8",18,81,55.7
+"9",16,76,31.7
+"10",17.3,80,21
diff --git a/src/main/java/water/fvec/ParseDataset2.java b/src/main/java/water/fvec/ParseDataset2.java
@@ -421,7 +421,7 @@ public MultiFileParseTask dfork(Key... keys){
           // Zipped file; no parallel decompression;
           ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
           _dout = streamParse(new GZIPInputStream(vec.openStream(pmon)),localSetup,_vecIdStart, chunkStartIdx,pmon);
-          // set this node as the one which rpocessed all the chunks
+          // set this node as the one which processed all the chunks
           for(int i = 0; i < vec.nChunks(); ++i)
             _chunk2Enum[chunkStartIdx + i] = H2O.SELF.index();
           break;

diff --git a/src/main/java/water/fvec/Vec.java b/src/main/java/water/fvec/Vec.java
@@ -1,12 +1,8 @@
 package water.fvec;
 
-import java.util.Arrays;
 import java.util.UUID;
 
 import water.*;
-import water.H2O.H2OCallback;
-import water.H2O.H2OCountedCompleter;
-import water.H2O.H2OEmptyCompleter;
 import water.util.Utils;
 import static water.util.Utils.seq;
 
@@ -362,7 +358,7 @@ public Key chunkKey(int cidx ) {
     return Key.make(bits);
   }
   /** Get a Chunk's Value by index.  Basically the index-to-key map,
-   *  plus the {@link DKV.get}.  Warning: this pulls the data locally;
+   *  plus the {@code DKV.get()}.  Warning: this pulls the data locally;
    *  using this call on every Chunk index on the same node will
    *  probably trigger an OOM!  */
   public Value chunkIdx( int cidx ) {
@@ -373,7 +369,7 @@ public Value chunkIdx( int cidx ) {
 
   protected boolean checkMissing(int cidx, Value val) {
     if( val != null ) return true;
-    assert val != null : "Missing chunk "+cidx+" for "+_key;
+    System.out.println("Missing chunk "+cidx+" for "+_key);
     return false;
   }
 
@@ -422,8 +418,9 @@ public Chunk elem2BV( int cidx ) {
     long start = chunk2StartElem(cidx); // Chunk# to chunk starting element#
     Value dvec = chunkIdx(cidx);        // Chunk# to chunk data
     Chunk c = dvec.get();               // Chunk data to compression wrapper
-    if( c._start == start ) return c;   // Already filled-in
-    assert c._start == -1 || c._start == start; // Second term in case multi-thread access
+    long cstart = c._start;             // Read once, since racily filled in
+    if( cstart == start ) return c;     // Already filled-in
+    assert cstart == -1;       // Was not filled in (everybody racily writes the same start value)
     c._start = start;          // Fields not filled in by unpacking from Value
     c._vec = this;             // Fields not filled in by unpacking from Value
     return c;
@@ -433,14 +430,6 @@ public final Chunk chunk( long i ) {
     return elem2BV(elem2ChunkIdx(i));
   }
 
-  /** Next Chunk from the current one. */
-  final Chunk nextBV( Chunk bv ) {
-    int cidx = bv.cidx()+1;
-    Chunk next =  cidx == nChunks() ? null : elem2BV(cidx);
-    assert next == null || next.cidx() == cidx;
-    return next;
-  }
-
   /** Fetch element the slow way, as a long.  Floating point values are
    *  silently rounded to an integer.  Throws if the value is missing. */
   public final long  at8( long i ) { return chunk(i).at8(i); }
@@ -483,8 +472,7 @@ public void remove( Futures fs ) {
   }
 
   @Override public boolean equals( Object o ) {
-    if( !(o instanceof Vec) ) return false;
-    return ((Vec)o)._key.equals(_key);
+    return o instanceof Vec && ((Vec)o)._key.equals(_key);
   }
   @Override public int hashCode() { return _key.hashCode(); }
 
@@ -565,10 +553,10 @@ public int reserveKeys(final int n){
     }
     /**
      * Gets the next n keys of this group.
-     * Performs atomic udpate of the group object to assure we get unique keys.
-     * The group size will be udpated by adding n.
+     * Performs atomic update of the group object to assure we get unique keys.
+     * The group size will be updated by adding n.
      *
-     * @param n
+     * @param n number of keys to make
      * @return arrays of unique keys belonging to this group.
      */
     public Key [] addVecs(final int n){
@@ -585,8 +573,7 @@ public int reserveKeys(final int n){
     }
 
     @Override public boolean equals( Object o ) {
-      if( !(o instanceof VectorGroup) ) return false;
-      return ((VectorGroup)o)._key.equals(_key);
+      return o instanceof VectorGroup && ((VectorGroup)o)._key.equals(_key);
     }
     @Override public int hashCode() {
       return _key.hashCode();
@@ -603,11 +590,10 @@ public static class CollectDomain extends MRTask2<CollectDomain> {
 
     public CollectDomain(Vec v) { _ymin = (int) v.min(); _nclass = (int)(v.max()-_ymin+1); }
     @Override public void map(Chunk ys) {
-      int ycls=0;
       for( int row=0; row<ys._len; row++ ) {
         if (ys.isNA0(row)) continue;
-        ycls = (int)ys.at80(row)-_ymin;
-        _dom[ycls] = 1; // Only write to shared array
+        int ycls = (int)ys.at80(row)-_ymin;
+        if( _dom[ycls] == 0 ) _dom[ycls] = 1; // Only write to shared array
       }
     }
 

diff --git a/src/main/java/water/parser/CsvParser.java b/src/main/java/water/parser/CsvParser.java
@@ -445,7 +445,10 @@ public CsvParser clone(){
       dout.rollbackLine();
     // If offset is still validly within the buffer, save it so the next pass
     // can start from there.
-    if( offset+1 < bits.length ) din.setChunkDataStart(cidx+1, offset+1 );
+    if( offset+1 < bits.length ) {
+      if( state == EXPECT_COND_LF && bits[offset+1] == CHAR_LF ) offset++;
+      if( offset+1 < bits.length ) din.setChunkDataStart(cidx+1, offset+1 );
+    }
     return dout;
   }