Merge branch 'master' of github.com:0xdata/h2o
cliffclick committed Aug 9, 2014
2 parents 0942962 + b36051b commit 868664f
Showing 8 changed files with 87 additions and 21 deletions.
@@ -31,7 +31,7 @@ check.deeplearning_imbalanced <- function(conn) {
print("")
print("--------------------")
}
-checkTrue(class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!")
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
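The change above loosens the per-class error assertion in check.deeplearning_imbalanced: instead of requiring the imbalanced run's class-6 error to be at least as large as the balanced run's, the test now only fails when the imbalanced error drops below 0.9× the balanced error, i.e. when balancing inflates the class-6 error by more than a factor of 1/0.9 ≈ 1.11. A minimal, standalone sketch of the relaxed check, using the RUnit package's checkTrue() and made-up error values (the real tests obtain checkTrue from their sourced helpers):

```r
# Minimal sketch of the relaxed assertion; the error values are hypothetical.
library(RUnit)

class_6_err_imbalanced <- 0.28   # pretend class-6 error without balancing
class_6_err_balanced   <- 0.30   # pretend class-6 error with balance_classes

# Old check: would fail, because 0.28 < 0.30.
# New check: passes, because 0.28 >= 0.9 * 0.30 = 0.27.
checkTrue(class_6_err_imbalanced >= 0.9 * class_6_err_balanced,
          "balance_classes makes it at least 10% worse!")
```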
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/gbm/runit_GBM_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.gbm.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.gbm(x=c(1,2,3,5),y=4,n.trees=50,data=prostate,balance.classes=F,nfolds=10)
+hh_imbalanced=h2o.gbm(x=c(1:54),y=55,n.trees=50,data=covtype,balance.classes=F,nfolds=10)
print(hh_imbalanced)
-hh_balanced=h2o.gbm(x=c(1,2,3,5),y=4,n.trees=50,data=prostate,balance.classes=T,nfolds=10)
+hh_balanced=h2o.gbm(x=c(1:54),y=55,n.trees=50,data=covtype,balance.classes=T,nfolds=10)
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] > hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= class_6_err_balanced, "balance_classes makes it worse!")

testEnd()
}
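The index arithmetic in the new comments is worth unpacking: with R's column-major storage, element (A, P) of an (N+1) x (N+1) matrix sits at linear index (P-1)*(N+1) + A, which is algebraically identical to the P*(N+1)-(N-A+1) used above; for A=6, P=8, N=7 that gives 62, the class-6 entry in the eighth column, which these tests treat as the per-class error column. A small standalone check of that arithmetic (the matrix below is a stand-in filled with 1..64, not real H2O output):

```r
# Standalone check of the linear-index formula used in the runit tests above.
N <- 7                                            # number of response classes
cm <- matrix(seq_len((N + 1)^2), nrow = N + 1)    # (N+1) x (N+1), column-major

A <- 6   # actual class (row)
P <- 8   # column 8, used by the tests as the per-class error column

idx <- P * (N + 1) - (N - A + 1)                  # 8*8 - 2 = 62
stopifnot(idx == 62)
stopifnot(identical(cm[A, P], cm[idx]))           # linear index hits the same element as cm[6, 8]
```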
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/rf/runit_RF_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.rf.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=F,nfolds=10, type = "BigData")
+hh_imbalanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=F,nfolds=10, type = "BigData")
print(hh_imbalanced)
-hh_balanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=T,nfolds=10, type = "BigData")
+hh_balanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=T,nfolds=10, type = "BigData")
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] > hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
30 changes: 25 additions & 5 deletions R/tests/testdir_algos/rf/runit_speedrf_imbalanced.R
@@ -2,15 +2,35 @@ setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

test.speedrf.imbalanced <- function(conn) {
-prostate = h2o.uploadFile(conn, locate("smalldata/logreg/prostate.csv"))
+covtype = h2o.uploadFile(conn, locate("smalldata/covtype/covtype.20k.data"))

-hh_imbalanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=F,nfolds=10)
+hh_imbalanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=F,nfolds=10)
print(hh_imbalanced)
-hh_balanced=h2o.randomForest(x=c(1,2,3,5),y=4,ntree=5,data=prostate,balance.classes=T,nfolds=10)
+hh_balanced=h2o.randomForest(x=c(1:54),y=55,ntree=50,data=covtype,balance.classes=T,nfolds=10)
print(hh_balanced)

-# test that it improves the overall classification error...
-checkTrue(hh_imbalanced@model$confusion[4,4] >= hh_balanced@model$confusion[4,4], "balance_classes makes it worse!")
+#compare error for class 6 (difficult minority)
+#confusion_matrix element at position A,P for N classes is at: model$confusion[P*(N+1)-(N-A+1)]
+#Here, A=6 P=8, N=7 -> need element 8*(7+1)-(7-6+1) = 62
+
+class_6_err_imbalanced = hh_imbalanced@model$confusion[62]
+class_6_err_balanced = hh_balanced@model$confusion[62]
+
+if (class_6_err_imbalanced < class_6_err_balanced) {
+print("--------------------")
+print("")
+print("FAIL, balanced error greater than imbalanced error")
+print("")
+print("")
+print("class_6_err_imbalanced")
+print(class_6_err_imbalanced)
+print("")
+print("class_6_err_balanced")
+print(class_6_err_balanced)
+print("")
+print("--------------------")
+}
+checkTrue(class_6_err_imbalanced >= 0.9*class_6_err_balanced, "balance_classes makes it at least 10% worse!")

testEnd()
}
1 change: 1 addition & 0 deletions h2o-cookbook/src/test/java/cookbook/FrameCookbook.java
@@ -75,6 +75,7 @@ public void frame_001() {
//
// If we did a DKV.remove() here instead of UKV.remove(), then the test would fail with
// leaked keys.
+fr.delete();
UKV.remove(resultFrameKey);
}
}
6 changes: 4 additions & 2 deletions h2o-cookbook/src/test/java/cookbook/KeyDemo.java
@@ -55,8 +55,10 @@ public void frame_001() {
Log.info("UKV okey after parse:" + UKV.get(okey));
Log.info("DKV okey after parse:" + DKV.get(okey));
Log.info("DKV okey get :" + DKV.get(okey).get());

Log.info("Global Keyset count :" + H2O.globalKeySet(null).size());

+H2O.KeySnapshot ks = H2O.KeySnapshot.globalSnapshot();
+long keyCount = ks.keys().length;
+Log.info("Global Keyset count :" + keyCount);
Log.info("Sanity check:key count after a few prints should not change anything;But not so straightforward if more than 1 nodes: "+H2O.store_size());
//Log.info(H2O.STORE.toString());

5 changes: 2 additions & 3 deletions src/main/java/water/persist/PersistHdfs.java
@@ -15,8 +15,7 @@
import water.Job.ProgressMonitor;
import water.api.Constants;
import water.api.Constants.Extensions;
-import water.fvec.HdfsFileVec;
-import water.fvec.Vec;
+import water.fvec.*;
import water.util.*;
import water.util.Log.Tag.Sys;

@@ -134,7 +133,7 @@ public static InputStream openStream(Key k, ProgressMonitor pmon) throws IOException
long skip = 0;
Key k = v._key;
if(k._kb[0] == Key.DVEC)
-skip = water.fvec.NFSFileVec.chunkOffset(k); // The offset
+skip = FileVec.chunkOffset(k); // The offset
final Path p = _iceRoot == null?new Path(getPathForKey(k)):new Path(_iceRoot, getIceName(v));
final long skip_ = skip;
run(new Callable() {
4 changes: 4 additions & 0 deletions src/main/java/water/persist/PersistS3.java
@@ -8,6 +8,7 @@
import water.*;
import water.Job.ProgressMonitor;
import water.api.Constants.Extensions;
+import water.fvec.FileVec;
import water.fvec.Vec;
import water.util.Log;
import water.util.RIStream;
@@ -94,6 +95,9 @@ public static Key loadKey(S3ObjectSummary obj) throws IOException {
byte[] b = MemoryManager.malloc1(v._max);
Key k = v._key;
long skip = 0;
+// Skip offset based on chunk number
+if(k._kb[0] == Key.DVEC)
+skip = FileVec.chunkOffset(k); // The offset
// Too complicate matters, S3 likes to reset connections when H2O hits it
// too hard. We "fix" this by just trying again, assuming we're getting
// hit with a bogus resource limit (H2O doing a parse looks like a DDOS to

0 comments on commit 868664f
