Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Normoyle committed Jan 8, 2014
2 parents 9282354 + b2a3ec9 commit a3ab869
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 78 deletions.
98 changes: 68 additions & 30 deletions R/h2oRClient-package/R/Classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -558,41 +558,41 @@ setMethod("as.data.frame", "H2OParsedData", function(x) {

# Substitute NAs for blank cells rather than skipping.
df = read.csv(textConnection(ttt), blank.lines.skip = FALSE)
return(df)
})

setMethod("head", "H2OParsedData", function(x, n = 6L, ...) {
if(n == 0 || !is.numeric(n)) stop("n must be a non-zero integer")
n = round(n)
# if(abs(n) > nrow(x)) stop(paste("n must be between 1 and", nrow(x)))
numRows = nrow(x)
if(n < 0 && abs(n) >= numRows) return(data.frame())
myView = ifelse(n > 0, min(n, numRows), numRows+n)
if(myView > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))

res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=0, view=myView)
temp = unlist(lapply(res$rows, function(y) { y$row = NULL; y }))
if(is.null(temp)) return(temp)
x.df = data.frame(matrix(temp, nrow = myView, byrow = TRUE))
colnames(x.df) = unlist(lapply(res$cols, function(y) y$name))
x.df
stopifnot(length(n) == 1L)
n <- ifelse(n < 0L, max(numRows + n, 0L), min(n, numRows))
if(n == 0) return(data.frame())

x.slice = as.data.frame(x[seq_len(n),])
res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
x.lev = lapply(res$summaries, function(x) { x$hbrk })
for(i in 1:ncol(x)) {
if(res$summaries[[i]]$stats$type == 'Enum')
x.slice[,i] <- factor(x.slice[,i], levels = res$summaries[[i]]$hbrk)
}
return(x.slice)
})

setMethod("tail", "H2OParsedData", function(x, n = 6L, ...) {
if(n == 0 || !is.numeric(n)) stop("n must be a non-zero integer")
n = round(n)
# if(abs(n) > nrow(x)) stop(paste("n must be between 1 and", nrow(x)))
numRows = nrow(x)
if(n < 0 && abs(n) >= numRows) return(data.frame())
myOff = ifelse(n > 0, max(0, numRows-n), abs(n))
myView = ifelse(n > 0, min(n, numRows), numRows+n)
if(myView > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))

res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=myOff, view=myView)
temp = unlist(lapply(res$rows, function(y) { y$row = NULL; y }))
if(is.null(temp)) return(temp)
x.df = data.frame(matrix(temp, nrow = myView, byrow = TRUE))
colnames(x.df) = unlist(lapply(res$cols, function(y) y$name))
x.df
stopifnot(length(n) == 1L)
nrx <- nrow(x)
n <- ifelse(n < 0L, max(nrx + n, 0L), min(n, nrx))
if(n == 0) return(data.frame())

idx = seq.int(to = nrx, length.out = n)
x.slice = as.data.frame(x[idx,])
rownames(x.slice) = idx
res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
x.lev = lapply(res$summaries, function(x) { x$hbrk })
for(i in 1:ncol(x)) {
if(res$summaries[[i]]$stats$type == 'Enum')
x.slice[,i] <- factor(x.slice[,i], levels = res$summaries[[i]]$hbrk)
}
return(x.slice)
})

setMethod("as.factor", "H2OParsedData", function(x) { h2o.__unop2("factor", x) })
Expand Down Expand Up @@ -822,12 +822,50 @@ setMethod("dim", "H2OParsedDataVA", function(x) {
as.numeric(c(res$num_rows, res$num_cols))
})

setMethod("length", "H2OParsedData", function(x) { ncol(x) })

setMethod("head", "H2OParsedDataVA", function(x, n = 6L, ...) {
head(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...)
numRows = nrow(x)
stopifnot(length(n) == 1L)
n <- ifelse(n < 0L, max(numRows + n, 0L), min(n, numRows))
if(n == 0) return(data.frame())
if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))

res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=0, view=n)
temp = lapply(res$rows, function(y) { y$row = NULL; as.data.frame(y) })
if(is.null(temp)) return(temp)
x.slice = do.call(rbind, temp)

res2 = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
x.lev = lapply(res2$summaries, function(x) { x$hbrk })
for(i in 1:ncol(x)) {
if(res2$summaries[[i]]$stats$type == 'Enum')
x.slice[,i] <- factor(x.slice[,i], levels = res2$summaries[[i]]$hbrk)
}
return(x.slice)
})

setMethod("tail", "H2OParsedDataVA", function(x, n = 6L, ...) {
tail(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...)
stopifnot(length(n) == 1L)
nrx <- nrow(x)
n <- ifelse(n < 0L, max(nrx + n, 0L), min(n, nrx))
if(n == 0) return(data.frame())
if(n > MAX_INSPECT_VIEW) stop(paste("Cannot view more than", MAX_INSPECT_VIEW, "rows"))

idx = seq.int(to = nrx, length.out = n)
res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT, key=x@key, offset=idx[1], view=length(idx))
temp = lapply(res$rows, function(y) { y$row = NULL; as.data.frame(y) })
if(is.null(temp)) return(temp)
x.slice = do.call(rbind, temp)
rownames(x.slice) = idx

res2 = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source = x@key)
x.lev = lapply(res2$summaries, function(x) { x$hbrk })
for(i in 1:ncol(x)) {
if(res2$summaries[[i]]$stats$type == 'Enum')
x.slice[,i] <- factor(x.slice[,i], levels = res2$summaries[[i]]$hbrk)
}
return(x.slice)
})

setMethod("summary", "H2OParsedDataVA", function(object) {
Expand Down
6 changes: 3 additions & 3 deletions R/h2oRClient-package/demo/h2o.kmeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ if("fpc" %in% rownames(installed.packages())) {
library(fpc)

par(mfrow=c(1,1))
plotcluster(prostate.data, prostate.clus$predict)
plotcluster(prostate.data, prostate.clus[,1])
title("K-Means Classification for k = 10")
}

# if(!"cluster" %in% rownames(installed.packages())) install.packages("cluster")
if("cluster" %in% rownames(installed.packages())) {
library(cluster)
clusplot(prostate.data, prostate.clus$predict, color = TRUE, shade = TRUE)
clusplot(prostate.data, prostate.clus[,1], color = TRUE, shade = TRUE)
}
pairs(prostate.data[,c(2,3,7,8)], col=prostate.clus$predict)
pairs(prostate.data[,c(2,3,7,8)], col=prostate.clus[,1])

# Plot k-means centers
par(mfrow = c(1,2))
Expand Down
43 changes: 43 additions & 0 deletions R/tests/testdir_golden/runit_gbm_1_golden.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
source('../findNSourceUtils.R')

test.gbmMSEgauss.golden <- function(H2Oserver) {

#Import data:
Log.info("Importing smtrees data...")
smtreesH2O<- h2o.uploadFile(H2Oserver, locate("../../smalldata/smtrees.csv"), key="smtreesH2O")
smtreesR<- read.csv(locate("../../smalldata/smtrees.csv"))

#Log.info("Test H2O generation of MSE for GBM")
fith2o<- h2o.gbm(x=c("girth", "height"), y="vol", n.trees=10, interaction.depth=1, distribution="gaussian", n.minobsinnode=2, shrinkage=.1, data=smtreesH2O)

#Reported MSE from H2O through R
err<- as.data.frame(fith2o@model$err)
REPMSE<- err[11,]

#MSE Calculated by hand From H2O predicted values
pred<- as.data.frame(h2o.predict(fith2o, newdata=smtreesH2O))
diff<- pred-smtreesR[,4]
diff<- diff[-1,]
diffsq<- diff^2
EXPMSE<- mean(diffsq)

Log.info("Print model MSE... \n")
Log.info(paste("Length of H2O MSE Vec: ", length(fith2o@model$err), "\t\t", "Expected Length : ", 10))
Log.info(paste("H2O Reported MSE : ", REPMSE, "\t\t", "R Expected MSE : ", EXPMSE))

Log.info("Compare model statistics in R to model statistics in H2O")
expect_equal(length(fith2o@model$err), 10)
expect_equal(REPMSE, EXPMSE, tolerance = 0.01)

testEnd()
}

doTest("GBM Test: Golden GBM - MSE for GBM Regression", test.gbmMSEgauss.golden)







2 changes: 1 addition & 1 deletion prj.el
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
'(jde-run-option-debug nil)
'(jde-run-option-vm-args (quote ("-XX:+PrintGC")))
'(jde-compile-option-directory "./target/classes")
'(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "hex.gbm.GBMTest")))
'(jde-run-option-application-args (quote ("-beta" "-mainClass" "hex.drf.Runner")))
'(jde-debugger (quote ("JDEbug")))
'(jde-compile-option-source (quote ("1.6")))
'(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar")))
Expand Down
Binary file not shown.
11 changes: 11 additions & 0 deletions smalldata/smtrees.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"","girth","height","vol"
"1",17.9,85,51.5
"2",14.2,83,21.4
"3",8.6,70,15.6
"4",11.7,82,77
"5",20.6,81,24.2
"6",8.3,77,42.6
"7",18,78,24.9
"8",18,81,55.7
"9",16,76,31.7
"10",17.3,80,21
2 changes: 1 addition & 1 deletion src/main/java/water/fvec/ParseDataset2.java
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ public MultiFileParseTask dfork(Key... keys){
// Zipped file; no parallel decompression;
ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
_dout = streamParse(new GZIPInputStream(vec.openStream(pmon)),localSetup,_vecIdStart, chunkStartIdx,pmon);
// set this node as the one which rpocessed all the chunks
// set this node as the one which processed all the chunks
for(int i = 0; i < vec.nChunks(); ++i)
_chunk2Enum[chunkStartIdx + i] = H2O.SELF.index();
break;
Expand Down
38 changes: 12 additions & 26 deletions src/main/java/water/fvec/Vec.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
package water.fvec;

import java.util.Arrays;
import java.util.UUID;

import water.*;
import water.H2O.H2OCallback;
import water.H2O.H2OCountedCompleter;
import water.H2O.H2OEmptyCompleter;
import water.util.Utils;
import static water.util.Utils.seq;

Expand Down Expand Up @@ -362,7 +358,7 @@ public Key chunkKey(int cidx ) {
return Key.make(bits);
}
/** Get a Chunk's Value by index. Basically the index-to-key map,
* plus the {@link DKV.get}. Warning: this pulls the data locally;
* plus the {@code DKV.get()}. Warning: this pulls the data locally;
* using this call on every Chunk index on the same node will
* probably trigger an OOM! */
public Value chunkIdx( int cidx ) {
Expand All @@ -373,7 +369,7 @@ public Value chunkIdx( int cidx ) {

protected boolean checkMissing(int cidx, Value val) {
if( val != null ) return true;
assert val != null : "Missing chunk "+cidx+" for "+_key;
System.out.println("Missing chunk "+cidx+" for "+_key);
return false;
}

Expand Down Expand Up @@ -422,8 +418,9 @@ public Chunk elem2BV( int cidx ) {
long start = chunk2StartElem(cidx); // Chunk# to chunk starting element#
Value dvec = chunkIdx(cidx); // Chunk# to chunk data
Chunk c = dvec.get(); // Chunk data to compression wrapper
if( c._start == start ) return c; // Already filled-in
assert c._start == -1 || c._start == start; // Second term in case multi-thread access
long cstart = c._start; // Read once, since racily filled in
if( cstart == start ) return c; // Already filled-in
assert cstart == -1; // Was not filled in (everybody racily writes the same start value)
c._start = start; // Fields not filled in by unpacking from Value
c._vec = this; // Fields not filled in by unpacking from Value
return c;
Expand All @@ -433,14 +430,6 @@ public final Chunk chunk( long i ) {
return elem2BV(elem2ChunkIdx(i));
}

/** Next Chunk from the current one. */
final Chunk nextBV( Chunk bv ) {
int cidx = bv.cidx()+1;
Chunk next = cidx == nChunks() ? null : elem2BV(cidx);
assert next == null || next.cidx() == cidx;
return next;
}

/** Fetch element the slow way, as a long. Floating point values are
* silently rounded to an integer. Throws if the value is missing. */
public final long at8( long i ) { return chunk(i).at8(i); }
Expand Down Expand Up @@ -483,8 +472,7 @@ public void remove( Futures fs ) {
}

@Override public boolean equals( Object o ) {
if( !(o instanceof Vec) ) return false;
return ((Vec)o)._key.equals(_key);
return o instanceof Vec && ((Vec)o)._key.equals(_key);
}
@Override public int hashCode() { return _key.hashCode(); }

Expand Down Expand Up @@ -565,10 +553,10 @@ public int reserveKeys(final int n){
}
/**
* Gets the next n keys of this group.
* Performs atomic udpate of the group object to assure we get unique keys.
* The group size will be udpated by adding n.
* Performs atomic update of the group object to assure we get unique keys.
* The group size will be updated by adding n.
*
* @param n
* @param n number of keys to make
* @return arrays of unique keys belonging to this group.
*/
public Key [] addVecs(final int n){
Expand All @@ -585,8 +573,7 @@ public int reserveKeys(final int n){
}

@Override public boolean equals( Object o ) {
if( !(o instanceof VectorGroup) ) return false;
return ((VectorGroup)o)._key.equals(_key);
return o instanceof VectorGroup && ((VectorGroup)o)._key.equals(_key);
}
@Override public int hashCode() {
return _key.hashCode();
Expand All @@ -603,11 +590,10 @@ public static class CollectDomain extends MRTask2<CollectDomain> {

public CollectDomain(Vec v) { _ymin = (int) v.min(); _nclass = (int)(v.max()-_ymin+1); }
@Override public void map(Chunk ys) {
int ycls=0;
for( int row=0; row<ys._len; row++ ) {
if (ys.isNA0(row)) continue;
ycls = (int)ys.at80(row)-_ymin;
_dom[ycls] = 1; // Only write to shared array
int ycls = (int)ys.at80(row)-_ymin;
if( _dom[ycls] == 0 ) _dom[ycls] = 1; // Only write to shared array
}
}

Expand Down
5 changes: 4 additions & 1 deletion src/main/java/water/parser/CsvParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,10 @@ public CsvParser clone(){
dout.rollbackLine();
// If offset is still validly within the buffer, save it so the next pass
// can start from there.
if( offset+1 < bits.length ) din.setChunkDataStart(cidx+1, offset+1 );
if( offset+1 < bits.length ) {
if( state == EXPECT_COND_LF && bits[offset+1] == CHAR_LF ) offset++;
if( offset+1 < bits.length ) din.setChunkDataStart(cidx+1, offset+1 );
}
return dout;
}

Expand Down
Loading

0 comments on commit a3ab869

Please sign in to comment.