Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
dearirenelang committed May 1, 2014
2 parents 9a33543 + 90c48cf commit c6d87de
Show file tree
Hide file tree
Showing 17 changed files with 99 additions and 72 deletions.
2 changes: 1 addition & 1 deletion R/h2o-package/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export('h2o.shutdown', 'h2o.init', 'h2o.glm', 'h2o.glm.FV', 'h2o.kmeans', 'h2o.r
'h2o.importFolder', 'h2o.uploadFile', 'h2o.parseRaw', 'h2o.importFile.VA', 'h2o.importFolder.VA',
'h2o.importHDFS.VA', 'h2o.importURL.VA', 'h2o.uploadFile.VA', 'h2o.parseRaw.VA', 'h2o.ls', 'h2o.rm',
'summary.H2OParsedData', 'summary.H2OParsedDataVA', 'screeplot.H2OPCAModel', 'h2o.cut', '.', 'h2o..',
'cbind.H2OParsedData', '[[.H2OParsedData', '[[<-.H2OParsedData', 'unique.H2OParsedData')
'cbind.H2OParsedData', 'unique.H2OParsedData')
exportMethods(
"colnames",
"show",
Expand Down
2 changes: 1 addition & 1 deletion R/h2o-package/R/Algorithms.R
Original file line number Diff line number Diff line change
Expand Up @@ -1201,7 +1201,7 @@ h2o.SpeeDRF <- function(x, y, data, classification=TRUE, validation,
#}

class_names = tail(res$'_domains', 1)[[1]]
result$confusion = .build_cm(res$'cm', class_names)
result$confusion = .build_cm(tail(res$cms, 1)[[1]]$'_arr', class_names)
}

return(result)
Expand Down
31 changes: 11 additions & 20 deletions R/h2o-package/R/Classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -590,26 +590,17 @@ setMethod("$<-", "H2OParsedData", function(x, name, value) {
return(new("H2OParsedData", h2o=x@h2o, key=x@key))
})

# S3 `[[` extractor for H2OParsedData: selects a single column by name.
#
# Args:
#   x:     frame to read from; must inherit from 'H2OParsedData'.
#   ...:   the column selector (a single column name).
#   exact: accepted for compatibility with the `[[` generic; not used here.
#
# Returns:
#   x unchanged when no selector is given, NULL when the selector is not one
#   of colnames(x), otherwise the result of x[, cols].
#
# Stops when x is missing or is not an H2OParsedData, or when more than one
# column is requested.
`[[.H2OParsedData` <- function(x, ..., exact = TRUE) {
  if( missing(x) ) stop('must specify x')
  # inherits() instead of class(x) == ...: class() can return several names,
  # which turns the comparison into a vector and an invalid `if` condition.
  if( !inherits(x, 'H2OParsedData') ) stop('x is the wrong class')

  # Gather every selector passed through `...`. as.list(...) converted only
  # the first one (silently ignoring the rest) and raised an error when `...`
  # was empty, so the two length checks below could not do their job.
  cols <- unlist(list(...))
  if( length(cols) == 0 )
    return(x)
  if( length(cols) > 1 ) stop('[[]] may only select one column')
  if( ! cols[1] %in% colnames(x) )
    return(NULL)

  x[, cols]
}
# S4 `[[` method for H2OParsedData: extracts one column, looked up by name.
#   x     - the H2OParsedData object to read from.
#   i     - column selector; it is checked against colnames(x).
#   exact - part of the `[[` signature; the body does not use it.
# Gives back x itself when i is not supplied, NULL when i is not among
# colnames(x), and x[, i] otherwise. Stops if i has more than one element.
setMethod("[[", "H2OParsedData", function(x, i, exact = TRUE) {
  # No selector at all: nothing to extract, return the frame as-is.
  if (missing(i)) {
    return(x)
  }
  if (length(i) > 1) {
    stop("[[]] may only select one column")
  }
  # Unknown selectors produce NULL instead of an error.
  is_known <- i %in% colnames(x)
  if (is_known) {
    x[, i]
  } else {
    NULL
  }
})

`[[<-.H2OParsedData` <- function(x, i, j, value) {
if( missing(x) ) stop('must specify x')
if( !inherits(x, 'H2OParsedData')) stop('x is the wrong class')
if( !inherits(value, 'H2OParsedData')) stop('can only append H2O data to H2O data')
if( ncol(value) > 1 ) stop('may only set a single column')
if( nrow(value) != nrow(x) ) stop(sprintf('replacement has %d row, data has %d', nrow(value), nrow(x)))
setMethod("[[<-", "H2OParsedData", function(x, i, value) {
if( !inherits(value, 'H2OParsedData')) stop('Can only append H2O data to H2O data')
if( ncol(value) > 1 ) stop('May only set a single column')
if( nrow(value) != nrow(x) ) stop(sprintf('Replacement has %d row, data has %d', nrow(value), nrow(x)))

mm <- match.call()
col_name <- as.list(i)[[1]]
Expand All @@ -623,7 +614,7 @@ setMethod("$<-", "H2OParsedData", function(x, name, value) {
colnames(x) <- cc
}
x
}
})

# Note: right now, all things must be H2OParsedData
cbind.H2OParsedData <- function(..., deparse.level = 1) {
Expand Down
9 changes: 5 additions & 4 deletions R/h2o-package/man/h2o.SpeeDRF.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
\item{sample.rate}{
(Optional) Sampling rate for constructing data from which individual trees are grown.
}
\item{oobe}{
\item{oobee}{
(Optional) A logical value indicating whether to calculate the out of bag error estimate.
}
\item{nbins}{
Expand All @@ -64,6 +64,8 @@ An \code{\linkS4class{H2OParsedData}} object containing the variables in the mod
}

\details{
IMPORTANT: Currently, you must initialize H2O with the flag \code{beta = TRUE} in \code{h2o.init} in order to use this method!

This method runs random forest model building on a single node, as opposed to the multi-node implementation in \code{\link{h2o.randomForest.FV}}.
}

Expand All @@ -79,13 +81,12 @@ An object of class \code{\linkS4class{H2OSpeeDRFModel}} with slots key, data, va
}

\seealso{
\code{\linkS4Class{H2OSpeeDRFModel}}, \code{\link{h2o.randomForest}}
\code{\linkS4class{H2OSpeeDRFModel}}, \code{\link{h2o.randomForest}}
}

\examples{
# Run an RF model on iris data
library(h2o)
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE, beta = TRUE)
irisPath = system.file("extdata", "iris.csv", package = "h2o")
iris.hex = h2o.importFile(localH2O, path = irisPath, key = "iris.hex")
h2o.SpeeDRF(x = c(2,3,4), y = 5, data = iris.hex, ntree = 50, depth = 100)
Expand Down
4 changes: 2 additions & 2 deletions R/h2o-package/man/h2o.gapStatistic.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ A list containing the following components:
}

\references{
Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. \emph{Journal of the Royal Statistical Society B}, \strong{63}, 411423.
Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. \emph{Journal of the Royal Statistical Society B}, \strong{63}, 411-423.

Tibshirani, R., Walther, G. and Hastie, T. (2000). Estimating the number of clusters in a dataset via the Gap statistic. Technical Report. Stanford.
}
Expand All @@ -43,7 +43,7 @@ Tibshirani, R., Walther, G. and Hastie, T. (2000). Estimating the number of clus
}
\examples{
library(h2o)
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE, beta = TRUE)
irisPath = system.file("extdata", "iris.csv", package = "h2o")
iris.hex = h2o.importFile(localH2O, path = irisPath)
h2o.gapStatistic(iris.hex, K.max = 10, B = 100)
Expand Down
6 changes: 0 additions & 6 deletions py/testdir_multi_jvm/test_NN2_mnist_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,6 @@ def test_NN2_mnist_multi(self):
nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

relTol = 0.10 ### 10% relative error is acceptable for Hogwild
h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

### Now score using the model, and check the validation error
kwargs = {
'source' : validation_key,
Expand All @@ -90,8 +87,5 @@ def test_NN2_mnist_multi(self):
nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)


h2o.beta_features = False

if __name__ == '__main__':
h2o.unit_main()
11 changes: 9 additions & 2 deletions py/testdir_multi_jvm/test_many_fp_formats_libsvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,18 +227,25 @@ def test_many_fp_formats_libsvm (self):
# we may not see the min/max range of values that was bounded by our gen, but
# we can check that it's a subset of the allowed range
if synKey == 'min':
self.assertTrue(syn[synKey] <= cols[synKey],
# can have quoted numbers in json?
self.assertTrue(syn[synKey] <= float(cols[synKey]),
msg='col %s %s %s should be <= %s' % (k, synKey, cols[synKey], syn[synKey]))
elif synKey == 'max':
self.assertTrue(syn[synKey] >= cols[synKey],
# can have quoted numbers in json?
self.assertTrue(syn[synKey] >= float(cols[synKey]),
msg='col %s %s %s should be >= %s' % (k, synKey, cols[synKey], syn[synKey]))
elif synKey == 'variance':
# can have quoted numbers in json?
self.assertTrue(syn[synKey] == float(cols[synKey]),
msg='col %s %s %s should be == %s' % (k, synKey, cols[synKey], syn[synKey]))
elif synKey == 'size' or synKey == 'scale' or synKey == 'type':
if cols[synKey] not in syn[synKey]:
# for debug of why it was a bad size
print "cols size/min/max:", cols['size'], cols['min'], cols['max']
print "syn size/min/max:", syn['size'], syn['min'], syn['max']
raise Exception('col %s %s %s should be in this allowed %s' % (k, synKey, cols[synKey], syn[synKey]))
else:
# can have quoted numbers in json?
self.assertEqual(syn[synKey], cols[synKey],
msg='col %s %s %s should be %s' % (k, synKey, cols[synKey], syn[synKey]))

Expand Down
2 changes: 1 addition & 1 deletion py/testdir_multi_jvm/test_many_fp_formats_libsvm_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def test_many_fp_formats_libsvm_2(self):

synMean = (v + 0.0)/rowCount
# enums don't have mean, but we're not enums
mean = inspect['cols'][k]['mean']
mean = float(inspect['cols'][k]['mean'])
# our fp formats in the syn generation sometimes only have two places?
self.assertAlmostEqual(mean, synMean, places=0,
msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))
Expand Down
8 changes: 4 additions & 4 deletions py/testdir_multi_jvm/test_parse_bounds_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ def test_parse_bounds_csv (self):
iColNameToOffset[iName] = iOffset
# just touching to make sure they are there
num_missing_values = iColDict['num_missing_values']
iMin = iColDict['min']
iMax = iColDict['max']
iMean = iColDict['mean']
iVariance = iColDict['variance']
iMin = float(iColDict['min'])
iMax = float(iColDict['max'])
iMean = float(iColDict['mean'])
iVariance = float(iColDict['variance'])

# SUMMARY********************************
summaryResult = h2o_cmd.runSummary(key=hex_key, max_column_display=colCount, timeoutSecs=timeoutSecs)
Expand Down
7 changes: 5 additions & 2 deletions py/testdir_multi_jvm/test_parse_bounds_libsvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,13 @@ def test_parse_bounds_libsvm (self):
# definitely not enums
zeros = columns['zeros']
na = columns['na']
print h2o.dump_json(columns)
print columns['max'], columns['min'], columns['mean'], columns['sigma']
# these numbers aren't quoted. array of 5 min, 5 max
smax = columns['max']
smin = columns['min']
mean = columns['mean']
sigma = columns['sigma']
mean = float(columns['mean'])
sigma = float(columns['sigma'])

# a single 1 in the last col
if name == "V" + str(colNumberMax): # h2o puts a "V" prefix
Expand Down
23 changes: 12 additions & 11 deletions py/testdir_multi_jvm/test_parse_libsvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def test_parse_libsvm(self):
# make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
# so probably 10x that for covtype200
csvFilenameList = [
("mnist_train.svm", "cM", 30, 0, 9, False, False),
("covtype.binary.svm", "cC", 30, 1, 2, True, True),
("mnist_train.svm", "cM", 30, 0, 9.0, False, False),
("covtype.binary.svm", "cC", 30, 1, 2.0, True, True),
# multi-label target like 1,2,5 ..not sure what that means
# ("tmc2007_train.svm", "cJ", 30, 0, 21.0, False, False),
# illegal non-ascending cols
Expand All @@ -54,13 +54,13 @@ def test_parse_libsvm(self):
# fails csvDownload
("duke.svm", "cD", 30, -1.000000, 1.000000, False, False),
("colon-cancer.svm", "cA", 30, -1.000000, 1.000000, False, False),
("news20.svm", "cH", 30, 1, 20, False, False),
("connect4.svm", "cB", 30, -1, 1, False, False),
("news20.svm", "cH", 30, 1, 20.0, False, False),
("connect4.svm", "cB", 30, -1, 1.0, False, False),
# too many features? 150K inspect timeout?
# ("E2006.train.svm", "cE", 30, 1, -7.89957807346873 -0.519409526940154, False, False)

("gisette_scale.svm", "cF", 30, -1, 1, False, False),
("mushrooms.svm", "cG", 30, 1, 2, False, False),
("gisette_scale.svm", "cF", 30, -1, 1.0, False, False),
("mushrooms.svm", "cG", 30, 1, 2.0, False, False),
]

### csvFilenameList = random.sample(csvFilenameAll,1)
Expand All @@ -83,14 +83,15 @@ def test_parse_libsvm(self):
h2o_cmd.infoFromInspect(inspectFirst, csvFilename)
# look at the min/max for the target col (0) and compare to expected for the dataset

imin = inspectFirst['cols'][0]['min']
imax = inspectFirst['cols'][0]['max']
imin = float(inspectFirst['cols'][0]['min'])
# print h2o.dump_json(inspectFirst['cols'][0])
imax = float(inspectFirst['cols'][0]['max'])

if expectedCol0Min:
self.assertEqual(imin, expectedCol0Min,
msg='col %s min %s is not equal to expected min %s' % (0, imin, expectedCol0Min))
if expectedCol0Max:
self.assertEqual(imax, expectedCol0Max,
h2o_util.assertApproxEqual(imax, expectedCol0Max, tol=0.00000001,
msg='col %s max %s is not equal to expected max %s' % (0, imax, expectedCol0Max))

print "\nmin/max for col0:", imin, imax
Expand Down Expand Up @@ -164,8 +165,8 @@ def test_parse_libsvm(self):
# make the check conditional based on the dataset
self.assertEqual(row_sizeA, row_sizeB,
"row_size mismatches after re-parse of downloadCsv result %d %d" % (row_sizeA, row_sizeB))
self.assertEqual(value_size_bytesA, value_size_bytesB,
"value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))
h2o_util.assertApproxEqual(value_size_bytesA, value_size_bytesB, tol=0.00000001,
msg="value_size_bytes mismatches after re-parse of downloadCsv result %d %d" % (value_size_bytesA, value_size_bytesB))

print "missingValuesListA:", missingValuesListA
print "missingValuesListB:", missingValuesListB
Expand Down
16 changes: 9 additions & 7 deletions src/main/java/hex/FrameSplitter.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
int[] splits = Utils.partitione(nrows, ratios);
assert splits.length == ratios.length+1 : "Unexpected number of splits";
for (int j=0; j<splits.length; j++) {
assert splits[j] > 0 : "Ups, no rows for " + j + "-th segment!";
//assert splits[j] > 0 : "Ups, no rows for " + j + "-th segment!";
r[j][i+1] = r[j][i] + splits[j]; // previous + current number of rows
}
}
Expand All @@ -116,12 +116,14 @@ private Vec[][] makeTemplates(Frame dataset, float[] ratios) {

private void onDone(boolean exceptional) {
dataset.unlock(jobKey);
for (Frame s : splits) {
if (!exceptional) { // just unlock if everything was ok
s.update(jobKey);
s.unlock(jobKey);
} else { // else delete current half-done results
if (s!=null) s.delete(jobKey,0f);
if (splits!=null) { // if exception is hit before splits array is allocated
for (Frame s : splits) {
if (!exceptional) { // just unlock if everything was ok
s.update(jobKey);
s.unlock(jobKey);
} else { // else delete current half-done results
if (s!=null) s.delete(jobKey,0f);
}
}
}
}
Expand Down
15 changes: 11 additions & 4 deletions src/main/java/water/api/FrameSplitPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ public class FrameSplitPage extends Func {
super.init();
/* Check input parameters */
float sum = 0;
long nrows = source.numRows();
if (nrows <= ratios.length) throw new IllegalArgumentException("Dataset does not have enough row to be split!");
for (int i=0; i<ratios.length; i++) {
if (!(ratios[i] > 0 && ratios[i] < 1)) throw new IllegalArgumentException("Split ration has to be in (0,1) interval!");
if (ratios[i] * nrows <= 1) throw new IllegalArgumentException("Ratio " + ratios[i] + " produces empty frame since the source frame has only " + nrows + "!");
sum += ratios[i];
}
if (sum>1) throw new IllegalArgumentException("Sum of split ratios has to be less or equal to 1!");
if (!(sum<1f)) throw new IllegalArgumentException("Sum of split ratios has to be less than 1!");
}

// Run the function
Expand All @@ -56,7 +59,7 @@ public class FrameSplitPage extends Func {

Frame[] splits = fs.getResult();

split_keys = new Key[splits.length];
split_keys = new Key [splits.length];
split_rows = new long[splits.length];
float rsum = Utils.sum(ratios);
split_ratios = Arrays.copyOf(ratios, splits.length);
Expand All @@ -72,30 +75,34 @@ public class FrameSplitPage extends Func {

@Override public boolean toHTML(StringBuilder sb) {
int nsplits = split_keys.length;
String [] headers = new String[nsplits+1];
String [] headers = new String[nsplits+2];
headers[0] = "";
for(int i=0; i<nsplits; i++) headers[i+1] = "Split #"+i;
headers[nsplits+1] = "Total";
DocGen.HTML.arrayHead(sb, headers);
// Key table row
sb.append("<tr><td>").append(DocGen.HTML.bold("Keys")).append("</td>");
for (int i=0; i<nsplits; i++) {
Key k = split_keys[i];
sb.append("<td>").append(Inspect2.link(k.toString(), k)).append("</td>");
sb.append("<td>").append(Inspect2.link(k)).append("</td>");
}
sb.append("<td>").append(Inspect2.link(source._key)).append("</td>");
sb.append("</tr>");
// Number of rows row
sb.append("<tr><td>").append(DocGen.HTML.bold("Rows")).append("</td>");
for (int i=0; i<nsplits; i++) {
long r = split_rows[i];
sb.append("<td>").append(String.format("%,d", r)).append("</td>");
}
sb.append("<td>").append(String.format("%,d", Utils.sum(split_rows))).append("</td>");
sb.append("</tr>");
// Split ratios
sb.append("<tr><td>").append(DocGen.HTML.bold("Ratios")).append("</td>");
for (int i=0; i<nsplits; i++) {
float r = 100*split_ratios[i];
sb.append("<td>").append(String.format("%.2f %%", r)).append("</td>");
}
sb.append("<td>").append(String.format("%.2f %%", 100*Utils.sum(split_ratios))).append("</td>");
sb.append("</tr>");
DocGen.HTML.arrayTail(sb);
return true;
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/water/api/Inspect.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,14 @@ public Response serveValueArray(final ValueArray va, int max_column) {
json.addProperty(SIZE, Math.abs(c._size));
json.addProperty(BASE, c._base);
json.addProperty(SCALE, (int) c._scale);
json.addProperty(MIN, (c.isEnum() || Double.isNaN(c._min)) ? "\"NaN\"" : String.valueOf(c._min));
json.addProperty(MAX, (c.isEnum() || Double.isNaN(c._max)) ? "\"NaN\"" : String.valueOf(c._max));
json.addProperty(MEAN, (c.isEnum() || Double.isNaN(c._mean)) ? "\"NaN\"" : String.valueOf(c._mean));
json.addProperty(VARIANCE, (c.isEnum() || Double.isNaN(c._sigma)) ? "\"NaN\"" : String.valueOf(c._sigma));
if(c.isEnum() || Double.isNaN(c._min)) json.addProperty(MIN, "\"NaN\"");
else json.addProperty(MIN, c._min);
if(c.isEnum() || Double.isNaN(c._max)) json.addProperty(MAX, "\"NaN\"");
else json.addProperty(MAX, c._max);
if(c.isEnum() || Double.isNaN(c._mean)) json.addProperty(MEAN, "\"NaN\"");
else json.addProperty(MEAN, c._mean);
if(c.isEnum() || Double.isNaN(c._sigma)) json.addProperty(VARIANCE, "\"NaN\"");
else json.addProperty(VARIANCE, c._sigma);
json.addProperty(NUM_MISSING_VALUES, va._numrows - c._n);
json.addProperty(TYPE, c.isEnum() ? "enum" : (c.isFloat() ? "float" : "int"));
json.addProperty(ENUM_DOMAIN_SIZE, c.isEnum() ? c._domain.length : 0);
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/water/api/Inspect2.java
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,9 @@ private String infoLink(Key k){
return "<a href='/2/Inspect2.html?src_key=" + k.toString() + "&offset=-1'>info</a>";
}

/**
 * Builds a link for a key, using the key's string form as both the link
 * text and the link target.
 *
 * @param k key to link to
 * @return the value produced by the text/key {@code link} overload when it is
 *         given {@code k.toString()} as the text
 */
public static String link(Key k) {
  // Reuse the (text, key) overload; it converts the key to its string form
  // for the target, so text and target end up identical.
  return link(k.toString(), k);
}
/**
 * Builds a link with caller-supplied text that points at the given key.
 *
 * @param txt text to pass through as the link's first argument
 * @param k   key whose string form is used as the second argument
 * @return the value produced by the two-string {@code link} overload
 *         (defined outside this view)
 */
public static String link(String txt,Key k) {
  final String target = k.toString();
  return link(txt, target);
}
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/water/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -1458,11 +1458,14 @@ public static final int[] partitione(int len, float[] ratio) {
int[] r = new int[ratio.length+1];
int sum = 0;
int i = 0;
float sr = 0;
for (i=0; i<ratio.length; i++) {
r[i] = (int) (ratio[i]*len);
sum += r[i];
sr += ratio[i];
}
r[i] = len - sum;
if (sr<1f) r[i] = len - sum;
else r[i-1] += (len-sum);
return r;
}

Expand Down
Loading

0 comments on commit c6d87de

Please sign in to comment.