Merge branch 'master' of github.com:0xdata/h2o
cliffclick committed Aug 9, 2014
2 parents 0942962 + b36051b commit 868664f
Showing 8 changed files with 87 additions and 21 deletions.
@@ -31,7 +31,7 @@ check.deeplearning_imbalanced <- function(conn) {
print("")
print("--------------------")
}
-checkTrue(class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!")
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
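The change above loosens the per-class error assertion in check.deeplearning_imbalanced: instead of requiring the imbalanced run's class-6 error to be at least as large as the balanced run's, the test now only fails when the imbalanced error drops below 0.9× the balanced error, i.e. when balancing inflates the class-6 error by more than a factor of 1/0.9 ≈ 1.11. A minimal, standalone sketch of the relaxed check, using the RUnit package's checkTrue() and made-up error values (the real tests obtain checkTrue from their sourced helpers):

```r
# Minimal sketch of the relaxed assertion; the error values are hypothetical.
library(RUnit)

class_6_err_imbalanced <- 0.28   # pretend class-6 error without balancing
class_6_err_balanced   <- 0.30   # pretend class-6 error with balance_classes

# Old check: would fail, because 0.28 < 0.30.
# New check: passes, because 0.28 >= 0.9 * 0.30 = 0.27.
checkTrue(class_6_err_imbalanced >= 0.9 * class_6_err_balanced,
          "balance_classes makes it at least 10% worse!")
```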
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/gbm/runit_GBM_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.gbm.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.gbm(x=c(1,2,3,5),y=4,n.trees=50,data=prostate,balance.classes=F,nfolds=10)
+hh_imbalanced=h2o.gbm(x=c(1:54),y=55,n.trees=50,data=covtype,balance.classes=F,nfolds=10)
print(hh_imbalanced)
-hh_balanced=h2o.gbm(x=c(1,2,3,5),y=4,n.trees=50,data=prostate,balance.classes=T,nfolds=10)
+hh_balanced=h2o.gbm(x=c(1:54),y=55,n.trees=50,data=covtype,balance.classes=T,nfolds=10)
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] > hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!")

testEnd()
}
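The index arithmetic in the new comments is worth unpacking: with R's column-major storage, element (A, P) of an (N+1) x (N+1) matrix sits at linear index (P-1)*(N+1) + A, which is algebraically identical to the P*(N+1)-(N-A+1) used above; for A=6, P=8, N=7 that gives 62, the class-6 entry in the eighth column, which these tests treat as the per-class error column. A small standalone check of that arithmetic (the matrix below is a stand-in filled with 1..64, not real H2O output):

```r
# Standalone check of the linear-index formula used in the runit tests above.
N <- 7                                            # number of response classes
cm <- matrix(seq_len((N + 1)^2), nrow = N + 1)    # (N+1) x (N+1), column-major

A <- 6   # actual class (row)
P <- 8   # column 8, used by the tests as the per-class error column

idx <- P * (N + 1) - (N - A + 1)                  # 8*8 - 2 = 62
stopifnot(idx == 62)
stopifnot(identical(cm[A, P], cm[idx]))           # linear index hits the same element as cm[6, 8]
```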
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/rf/runit_RF_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.rf.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=F,nfolds=10, type = "BigData")
+hh_imbalanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=F,nfolds=10, type = "BigData")
print(hh_imbalanced)
-hh_balanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=T,nfolds=10, type = "BigData")
+hh_balanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=T,nfolds=10, type = "BigData")
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] > hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/rf/runit_speedrf_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.speedrf.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=F,nfolds=10)
+hh_imbalanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=F,nfolds=10)
print(hh_imbalanced)
-hh_balanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=T,nfolds=10)
+hh_balanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=T,nfolds=10)
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] >= hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
1 change: 1 addition & 0 deletions h2o-cookbook/src/test/java/cookbook/FrameCookbook.java
@@ -75,6 +75,7 @@ public void frame_001() {
//
// If we did a DKV.remove() here instead of UKV.remove(), then the test would fail with
// leaked keys.
+fr.delete();
UKV.remove(resultFrameKey);
}
}
6 changes: 4 additions & 2 deletions h2o-cookbook/src/test/java/cookbook/KeyDemo.java
@@ -55,8 +55,10 @@ public void frame_001() {
Log.info("UKV okey after parse:" + UKV.get(okey));
Log.info("DKV okey after parse:" + DKV.get(okey));
Log.info("DKV okey get :" + DKV.get(okey).get());

Log.info("Global Keyset count :" + H2O.globalKeySet(null).size());

+H2O.KeySnapshot ks = H2O.KeySnapshot.globalSnapshot();
+long keyCount = ks.keys().length;
+Log.info("Global Keyset count :" + keyCount);
Log.info("Sanity check:key count after a few prints should not change anything;But not so straightforward if more than 1 nodes: "+H2O.store_size());
//Log.info(H2O.STORE.toString());

5 changes: 2 additions & 3 deletions src/main/java/water/persist/PersistHdfs.java
@@ -15,8 +15,7 @@
import water.Job.ProgressMonitor;
import water.api.Constants;
import water.api.Constants.Extensions;
-import water.fvec.HdfsFileVec;
-import water.fvec.Vec;
+import water.fvec.*;
import water.util.*;
import water.util.Log.Tag.Sys;

@@ -134,7 +133,7 @@ public static InputStream openStream(Key k, ProgressMonitor pmon) throws IOException
long skip = 0;
Key k = v._key;
if(k._kb[0] == Key.DVEC)
-skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
+skip = FileVec.chunkOffset(k); // The offset
final Path p = _iceRoot == null?new Path(getPathForKey(k)):new Path(_iceRoot, getIceName(v));
final long skip_ = skip;
run(new Callable() {
4 changes: 4 additions & 0 deletions src/main/java/water/persist/PersistS3.java
@@ -8,6 +8,7 @@
import water.*;
import water.Job.ProgressMonitor;
import water.api.Constants.Extensions;
+import water.fvec.FileVec;
import water.fvec.Vec;
import water.util.Log;
import water.util.RIStream;
@@ -94,6 +95,9 @@ public static Key loadKey(S3ObjectSummary obj) throws IOException {
byte[] b = MemoryManager.malloc1(v._max);
Key k = v._key;
long skip = 0;
+// Skip offset based on chunk number
+if(k._kb[0] == Key.DVEC)
+skip = FileVec.chunkOffset(k); // The offset
// Too complicate matters, S3 likes to reset connections when H2O hits it
// too hard. We "fix" this by just trying again, assuming we're getting
// hit with a bogus resource limit (H2O doing a parse looks like a DDOS to

0 comments on commit 868664f
