Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
arnocandel committed Apr 5, 2014
2 parents d7cea02 + 669c912 commit 3c6315a
Show file tree
Hide file tree
Showing 15 changed files with 84 additions and 14 deletions.
10 changes: 8 additions & 2 deletions h2o-perf/bench/R/h2oPerf/prologue.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,16 +223,22 @@ function(pkey, dataPath) {
#Modeling
# Summarize the frame stored under key "parsed.hex" through an
# H2OParsedDataVA handle.
#
# Reads the free variable `h` (the H2O connection object), which is not
# defined inside this function.
# Returns the value of summary() on the handle.
#
# The earlier, immediately-overwritten H2OParsedData assignment was dead code
# and has been dropped; only the H2OParsedDataVA handle was ever summarized.
runSummary.VA<-
function() {
  data <- new("H2OParsedDataVA", h2o = h, key = "parsed.hex", logic = FALSE)
  summary(data)
}

# Summarize the frame stored under key "parsed.hex" through an
# H2OParsedData handle.
#
# Reads the free variable `h` (the H2O connection object), which is not
# defined inside this function.
# Returns the value of summary() on the handle.
#
# The earlier, immediately-overwritten H2OParsedDataVA assignment was dead
# code and has been dropped; only the H2OParsedData handle was ever summarized.
runSummary.FV<-
function() {
  data <- new("H2OParsedData", h2o = h, key = "parsed.hex", logic = TRUE)
  summary(data)
}

# Run h2o.ddply on the frame stored under key "parsed.hex".
#
# Arguments:
#   .variables  grouping specification, forwarded unchanged to h2o.ddply.
#   .fun        function applied per group (default NULL), forwarded unchanged.
#   ...         extra arguments forwarded to h2o.ddply.
#   .progress   progress setting (default 'none'), forwarded to h2o.ddply.
#
# Reads the free variable `h` (the H2O connection object), which is not
# defined inside this function.
# Returns the value of the h2o.ddply call.
#
# NOTE(review): this wrapper builds an H2OParsedDataVA handle; confirm that
# h2o.ddply accepts that class rather than H2OParsedData.
runH2o.ddply<-
function(.variables, .fun = NULL, ..., .progress = 'none') {
  data <- new("H2OParsedDataVA", h2o = h, key = "parsed.hex", logic = FALSE)
  # .progress must be passed by name: an argument supplied positionally after
  # `...` cannot bind to a formal that the callee declares after `...`
  # (presumably h2o.ddply mirrors this signature), so it would be absorbed
  # into the callee's `...` instead.
  h2o.ddply(data, .variables, .fun, ..., .progress = .progress)
}

runGBM<-
function(x, y, distribution='multinomial',
n.trees=10, interaction.depth=5,
Expand Down
14 changes: 14 additions & 0 deletions h2o-perf/bench/tests/ddply_airlines-1B/ddply_airlines-1B.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[H2OBuildInformation]
aws = False
remote_hosts = True
heap_bytes_per_node = 200g
total_hosts = 1
total_nodes = 1
instance_type = none
nodes_per_host = 1

[Host1]
ip = 192.168.1.162
port = 54321
memory_bytes = 200g
num_cpus = 32
4 changes: 4 additions & 0 deletions h2o-perf/bench/tests/ddply_airlines-1B/model.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Benchmark model step: run h2o.ddply with nrow over five grouping columns.
# The prologue is sourced first; `h` below is presumably defined there — confirm.
source("../../R/h2oPerf/prologue.R")
# Handle to the frame stored under key "parsed.hex".
data <- new("H2OParsedData", h2o = h, key = "parsed.hex", logic = TRUE)
# Apply nrow per combination of columns C2, C7, C9, C10 and C12.
h2o.ddply(data, .("C2", "C7", "C9", "C10", "C12"), nrow)
# The epilogue is sourced last; presumably it reports results — confirm.
source("../../R/h2oPerf/epilogue.R")
7 changes: 7 additions & 0 deletions h2o-perf/bench/tests/ddply_airlines-1B/parse.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Benchmark parse step: import the dataset at `trainData` under key "parsed.hex".
source("../../R/h2oPerf/prologue.R")
# `<<-` assigns outside the local scope; these variables are presumably read
# by the sourced prologue/epilogue code — confirm.
data_source <<- "home-0xdiag-datasets"
trainData <<- "/home/0xdiag/datasets/airlines/airlines1B"
# import.FV is not defined in this script; presumably provided by the prologue.
import.FV("parsed.hex", trainData)
# Hard-coded dataset dimensions (row count and number of explanatory columns).
num_train_rows <<- 1021368222
num_explan_cols <<- 12
source("../../R/h2oPerf/epilogue.R")
2 changes: 1 addition & 1 deletion h2o-perf/bench/tests/summary_one-billion-rows/parse.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Benchmark parse step: import the dataset at `trainData` under key "parsed.hex".
source("../../R/h2oPerf/prologue.R")
# `<<-` assigns outside the local scope; these variables are presumably read
# by the sourced prologue/epilogue code — confirm.
data_source <<- "home-0xdiag-datasets"
trainData <<- "/home/0xdiag/datasets/airlines/airlines1B"
# Import once, through import.FV only. The superseded import.VA call wrote to
# the same "parsed.hex" key and was immediately followed by this import, so
# running both imported the same dataset twice.
import.FV("parsed.hex", trainData)
# Hard-coded dataset dimensions (row count and number of explanatory columns).
num_train_rows <<- 1021368222
num_explan_cols <<- 12
source("../../R/h2oPerf/epilogue.R")
12 changes: 7 additions & 5 deletions py/h2o_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ def delete_keys(node=None, pattern=None, timeoutSecs=120):
for k in keys:
if k in triedKeys:
print "Already tried to delete %s. Must have failed. Not trying again" % k
# don't delete the DRF __Tree__ keys. deleting the model does that. causes race conditions
elif '__Tree__' in k:
print "Not deleting a tree key from DRF: %s" % k
else:
node.remove_key(k['key'], timeoutSecs=timeoutSecs)
deletedCnt += 1
Expand All @@ -411,11 +414,6 @@ def delete_keys_at_all_nodes(node=None, pattern=None, timeoutSecs=120):
# TEMP: change this to remove_all_keys which ignores locking and removes keys?
# getting problems when tests fail in multi-test-on-one-h2o-cluster runner*sh tests
if not node: node = h2o.nodes[0]
# FIX! stop using RemoveAll for now. doesn't it hang or ?
if 1==0 and not pattern:
node.remove_all_keys(timeoutSecs=timeoutSecs)
return 0 # don't have a count of keys?

totalDeletedCnt = 0
# do it in reverse order, since we always talk to 0 for other stuff
# this will be interesting if the others don't have a complete set
Expand All @@ -429,6 +427,10 @@ def delete_keys_at_all_nodes(node=None, pattern=None, timeoutSecs=120):
print "Total: Deleted", totalDeletedCnt, "keys with filter=", pattern, "at", len(h2o.nodes), "nodes"
else:
print "Total: Deleted", totalDeletedCnt, "keys at", len(h2o.nodes), "nodes"
# do a remove_all_keys to clean out any locked keys also (locked keys will complain above)
# doesn't work if you remove job keys first, since it looks at the job list and gets confused
### node.remove_all_keys(timeoutSecs=timeoutSecs)

return totalDeletedCnt


Expand Down
4 changes: 2 additions & 2 deletions py/h2o_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5):
polls += 1
# get utilization and print it
# any busy jobs
a = h2o.nodes[0].jobs_admin(timeoutSecs=30)
a = h2o.nodes[0].jobs_admin(timeoutSecs=60)
busy = False
for j in a['jobs']:
if j['end_time']=='' and not (j['cancelled'] or (j['result'].get('val', None)=='CANCELLED')):
Expand Down Expand Up @@ -102,7 +102,7 @@ def pollStatsWhileBusy(timeoutSecs=300, pollTimeoutSecs=15, retryDelaySecs=5):
# what the heck, just look for a match in any of the 3 (no regex)
# if pattern is not None, only stall on jobs that match the pattern (in any of those 3)

def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=30, pollTimeoutSecs=30, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None):
def pollWaitJobs(pattern=None, errorIfCancelled=False, timeoutSecs=60, pollTimeoutSecs=60, retryDelaySecs=5, benchmarkLogging=None, stallForNJobs=None):
wait = True
waitTime = 0
ignoredJobs = set()
Expand Down
2 changes: 1 addition & 1 deletion py/testdir_release/c4/test_c4_four_billion_rows_fvec.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest, time, sys, random
sys.path.extend(['.','..','../..','py'])
import h2o, h2o_cmd, h2o_hosts, h2o_glm, h2o_browse as h2b, h2o_import as h2i, h2o_common
import h2o, h2o_cmd, h2o_hosts, h2o_glm, h2o_browse as h2b, h2o_import as h2i, h2o_common, h2o_exec as h2e

print "Assumes you ran ../build_for_clone.py in this directory"
print "Using h2o-nodes.json. Also the sandbox dir"
Expand Down
3 changes: 2 additions & 1 deletion py/testdir_release/runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ then
myPy c9 test_c9_GBM_airlines_hdfs.py
myPy c10 test_c10_rel_gbm.py
# put known failure last
myPy c7 test_c7_rel.py
# doesn't work. key gets locked. forget about it
# myPy c7 test_c7_rel.py

else
myPy $TESTDIR $TEST
Expand Down
3 changes: 2 additions & 1 deletion py/testdir_release/runner2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ then
# myPy c4 test_c4_four_billion_rows.py
myPy c6 test_c6_hdfs.py
# myPy c7 test_c7_rel.py
myPy c7 test_c7_fvec.py
# doesn't work. key gets locked. forget about it
# myPy c7 test_c7_fvec.py

# myPy c8 test_c8_rf_airlines_hdfs.py
myPy c9 test_c9b_GBM_airlines_hdfs.py
Expand Down
3 changes: 2 additions & 1 deletion py/testdir_release/runner_164.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ then
# known failure last
myPy c6 test_c6_hdfs.py
# fails with summary. currently disable summary
myPy c7 test_c7_rel.py
# doesn't work. key gets locked. forget about it
# myPy c7 test_c7_rel.py
myPy c8 test_c8_rf_airlines_hdfs.py

myPy c1 test_c1_fvec.py
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/hex/gbm/SharedTreeModelBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ public abstract class SharedTreeModelBuilder<TM extends DTree.TreeModel> extends
throw new IllegalArgumentException("Constant response column!");
if (_nclass > MAX_SUPPORTED_LEVELS)
throw new IllegalArgumentException("Too many levels in response column!");

int usableColumns = 0;
for (int i = 0; i < _ncols; i++) {
Vec v = source.vec(i);
if (v.isBad() || v.isConst()) continue;
usableColumns++;
}
if (usableColumns==0) throw new IllegalArgumentException("There is no usable column to generate model!");
}

// --------------------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/water/fvec/CXDChunk.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,7 @@ public Iterator<Value> values(){
});
}

/**
 * Returns the constant 22.
 * Presumably the character width of one value printed with pformat0() — confirm against callers.
 */
public int pformat_len0() {
  return 22;
}
/**
 * Returns the format string "% 21.15e": scientific notation with 15 fractional
 * digits, minimum width 21, and a leading space for non-negative values.
 */
public String pformat0() {
  return "% 21.15e";
}

}
8 changes: 8 additions & 0 deletions src/main/java/water/fvec/Vec.java
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,14 @@ public Vec adaptTo(Vec v, boolean exact) {
/** Returns the value of the {@code _time} field. */
public final int timeMode(){ return _time; }
/** Returns the entry of {@code ParseTime.TIME_PARSE} indexed by the {@code _time} field. */
public final String timeParse(){ return ParseTime.TIME_PARSE[_time]; }

/** Is the column constant.
* <p>Returns true if the column contains only constant values and it is not full of NAs.</p>
* <p>Implemented as {@code min() == max()}. NOTE(review): the "not full of NAs" part
* presumably relies on min()/max() being NaN for an all-NA column (NaN == NaN is false) — confirm.</p> */
public final boolean isConst() { return min() == max(); }
/** Is the column bad.
* <p>Returns true if the column is full of NAs, i.e. when {@code naCnt()} equals {@code length()}.</p>
*/
public final boolean isBad() { return naCnt() == length(); }

/** Map the integer value for an enum/factor/categorical to its String.
* Error if it is not an ENUM. The long index is narrowed to int before the lookup in {@code _domain}. */
public String domain(long i) { return _domain[(int)i]; }
Expand Down
15 changes: 15 additions & 0 deletions src/test/java/hex/drf/DRFTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,21 @@ abstract static class PrepData { abstract int prep(Frame fr); }
s("3", "4", "5", "6", "8"));
}

// Expects basicDRFTestOOBE to throw IllegalArgumentException for this input.
// NOTE(review): presumably the prep step leaves no usable (non-constant, non-NA)
// predictor column, which the model builder rejects — confirm against poker100.
@Test(expected=IllegalArgumentException.class)
public void testConstantCols() throws Throwable {
basicDRFTestOOBE(
"./smalldata/poker/poker100","poker.hex",
new PrepData() { @Override int prep(Frame fr) {
// Remove the column at index 4 seven times (later columns shift down each time)
// and delete each removed column's key from the store.
for (int i=0; i<7;i++) UKV.remove(fr.remove(4)._key);
// Index of column "C11"; presumably used as the response column — confirm.
return fr.find("C11");
} },
1,
// NOTE(review): these expected values are presumably never compared, since the
// test passes only when the exception is thrown — confirm.
a( a(46294, 202),
a( 3187, 107)),
s("0", "1"));

}

//@Test
public void testCreditSample1() throws Throwable {
basicDRFTestOOBE(
Expand Down

0 comments on commit 3c6315a

Please sign in to comment.