Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Normoyle committed Feb 1, 2014
2 parents 6bd4af7 + 133ada9 commit c4a5aec
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 8 deletions.
63 changes: 63 additions & 0 deletions R/tests/testdir_jira/runit_NOPASS_pub_168_dfpredicates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#
# test filtering via factors
#


setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
source('../findNSourceUtils.R')



# Check that an H2O frame can be row-filtered with predicates on factor
# (string-valued) columns, both on the H2O side and after pulling the
# filtered frames into local data.frames.
#
# Args:
#   conn: connection object handed to h2o.importFile().
#
# The expected values below follow from smalldata/jira/pub-180.csv.
factorfilter <- function(conn) {
  Log.info('uploading factor filtering testing dataset')
  df.h <- h2o.importFile(conn, locate('smalldata/jira/pub-180.csv'))

  Log.info('printing from h2o')
  Log.info(head(df.h))

  Log.info('subsetting via factor')
  # Filter by column name.
  df.h.1 <- df.h[df.h$colgroup == 'a', ]
  expect_true(all(dim(df.h.1) == c(3, 4)))

  # Filter by column position.
  df.h.2 <- df.h[df.h[, 2] == "group2", ]
  expect_true(all(dim(df.h.2) == c(2, 4)))

  # Filter by a conjunction of two factor predicates.
  df.h.3 <- df.h[df.h[, 2] == 'group1' & df.h$colgroup == 'c', ]
  expect_true(all(dim(df.h.3) == c(1, 4)))

  Log.info('localizing')
  df.1 <- as.data.frame(df.h.1)
  df.2 <- as.data.frame(df.h.2)
  df.3 <- as.data.frame(df.h.3)

  Log.info('testing')
  expect_true(all(dim(df.1) == c(3, 4)))
  # all() rather than unique() == value: stays a single TRUE/FALSE even if
  # the column unexpectedly holds more than one distinct value.
  expect_true(all(df.1[, 1] == 'a'))
  expect_true(all(df.1[, 2] == 'group1'))
  expect_true(all(df.1[, 3] == c(1, 2, 1)))
  expect_true(all(df.1[, 4] == c(2, 3, 2)))

  expect_true(all(dim(df.2) == c(2, 4)))
  expect_true(all(df.2[, 1] == 'c'))
  expect_true(all(df.2[, 2] == 'group2'))
  expect_true(all(df.2[, 3] == c(5, 5)))
  expect_true(all(df.2[, 4] == c(6, 6)))

  expect_true(all(dim(df.3) == c(1, 4)))
  # df.3 has a single row, so every cell is read from row 1.
  expect_true(df.3[1, 1] == 'c')
  expect_true(df.3[1, 2] == 'group1')
  expect_true(df.3[1, 3] == 5)
  expect_true(df.3[1, 4] == 6)

  testEnd()
}

# Reference only (never executed): the same three filters that factorfilter
# applies to the H2O frame, written against a plain R data.frame.
if (FALSE) {
  data <- read.csv(locate('smalldata/jira/pub-180.csv'), header = TRUE)

  data[data$colgroup == 'a', ]
  data[data[, 2] == 'group2', ]
  data[data[, 2] == 'group1' & data$colgroup == 'c', ]
}


# Hand the test function to the shared runner.
doTest('factor filtering', factorfilter)
86 changes: 86 additions & 0 deletions R/tests/testdir_jira/runit_NOPASS_pub_180_ddply.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#
# ddply
#


setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
source('../findNSourceUtils.R')



# Exercise ddply() on an H2O frame for four cases: one or two grouping
# columns, each combined with an aggregate over one or two value columns.
# The results are pulled into local data.frames and compared against values
# derived from smalldata/jira/pub-180.csv.
#
# Args:
#   conn: connection object handed to h2o.importFile().
ddplytest <- function(conn) {
  Log.info('uploading ddply testing dataset')
  df.h <- h2o.importFile(conn, locate('smalldata/jira/pub-180.csv'))

  Log.info('printing from h2o')
  Log.info(head(df.h))

  Log.info('grouping over a single column (equivalent to tapply)')
  df.h.1 <- ddply(df.h, .(colgroup), function(df) { min(df$col1) })

  Log.info('grouping over multiple columns (equivalent to tapply with IDX=group1 + group2)')
  df.h.2 <- ddply(df.h, .(colgroup, colgroup2), function(df) { min(df$col1) })

  Log.info('single grouping column, use 2 columns')
  df.h.3 <- ddply(df.h, .(colgroup), function(df) { min(df$col1 + df$col2) })

  Log.info('grouping multiple columns, use 2 columns')
  # Stored as df.h.4: this is the fourth case and must not overwrite the
  # single-grouping-column result kept in df.h.3.
  df.h.4 <- ddply(df.h, .(colgroup, colgroup2), function(df) { min(df$col1 + df$col2) })

  Log.info('pulling data locally')
  df.1 <- as.data.frame(df.h.1)
  df.2 <- as.data.frame(df.h.2)
  df.3 <- as.data.frame(df.h.3)
  df.4 <- as.data.frame(df.h.4)

  Log.info('testing')
  # Case 1: min(col1) per colgroup.
  expect_true(all(dim(df.1) == c(3, 2)))
  expect_true(all(df.1[, 1] == c('a', 'b', 'c')))
  expect_true(all(df.1[, 2] == c(1, 3, 5)))

  # Case 2: min(col1) per (colgroup, colgroup2).
  expect_true(all(dim(df.2) == c(5, 3)))
  expect_true(all(df.2[, 1] == c('a', 'b', 'b', 'c', 'c')))
  expect_true(all(df.2[, 2] == paste0('group', c(1, 1, 3, 1, 2))))
  expect_true(all(df.2[, 3] == c(1, 3, 7, 5, 5)))

  # Case 3: min(col1 + col2) per colgroup.
  expect_true(all(dim(df.3) == c(3, 2)))
  expect_true(all(df.3[, 1] == c('a', 'b', 'c')))
  expect_true(all(df.3[, 2] == c(3, 7, 11)))

  # Case 4: min(col1 + col2) per (colgroup, colgroup2).
  expect_true(all(dim(df.4) == c(5, 3)))
  expect_true(all(df.4[, 1] == c('a', 'b', 'b', 'c', 'c')))
  expect_true(all(df.4[, 2] == paste0('group', c(1, 1, 3, 1, 2))))
  expect_true(all(df.4[, 3] == c(3, 7, 18, 11, 11)))

  testEnd()
}

# Reference only (never executed): plain-R / plyr equivalents of the four
# ddply cases exercised by ddplytest.
if (FALSE) {
  library(plyr)
  data <- read.csv(locate('smalldata/jira/pub-180.csv'), header = TRUE)

  # example 1 in plain R
  # tapply and ddply compute the same per-group minimum; tapply returns a
  # named vector, ddply returns a data frame.
  # sql GROUP BY colgroup
  tapply(data$col1, data$colgroup, min)
  ddply(data, .(colgroup), function(df) { min(df$col1) })

  # example 2 -- equivalent to sql GROUP BY colgroup, colgroup2;
  # The tapply call works on `data`; `df` only exists inside the ddply
  # callbacks.
  tapply(data$col1, paste(data$colgroup, data$colgroup2), min)
  ddply(data, .(colgroup, colgroup2), function(df) { min(df$col1) })

  # example 3 - can't build with tapply
  ddply(data, .(colgroup), function(df) { min(df$col1 + df$col2) })

  # example 4 - can't build with tapply
  ddply(data, .(colgroup, colgroup2), function(df) { min(df$col1 + df$col2) })
}


# Hand the test function to the shared runner.
doTest('ddply', ddplytest)
11 changes: 4 additions & 7 deletions R/tests/testdir_jira/runit_NOPASS_v_3_apply.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@
# apply
#


setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
source('../findNSourceUtils.R')



applytest <- function(conn){
Log.info('uploading apply testing dataset')
df.h <- h2o.importFile(conn, locate('smalldata/jira/v-3.csv'))
Expand All @@ -18,14 +15,16 @@ applytest <- function(conn){
Log.info('applying over 1, 2, 1:2')
df.h.1 <- apply(df.h, 1, function(x){ sum(x) })
df.h.2 <- apply(df.h, 2, function(x){ sum(x) })
df.h.3 <- apply(df.h, 1:2, function(x){ x + 1})
# While the semantics of apply(,1:2,) are easy (same as map), the syntactic
# form is annoying to deal with right now. Dropped, as the alternative
# forms to get the same job done are easy & supported: df <- df+1
# df.h.3 <- apply(df.h, 1:2, function(x){ x + 1})

Log.info('pulling data locally')
df.1 <- as.data.frame( df.h.1 )
df.2 <- as.data.frame( df.h.2 )
df.3 <- as.data.frame( df.h.3 )


expect_that(all( df.1[,1] == c(3,7,11) ))
expect_that(all( df.2[,1] == c(9, 12) ))
expect_that(all( df.3[,1] == c(2,4,6) ))
Expand All @@ -34,6 +33,4 @@ applytest <- function(conn){
testEnd()
}



doTest('apply', applytest)
13 changes: 13 additions & 0 deletions smalldata/jira/pub-180.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"colgroup", "colgroup2", "col1", "col2"
"a","group1",1, 2
"b","group1", 3, 4
"c","group2", 5, 6
"b","group3", 7, 11
"a","group1",2, 3
"b","group1", 3, 4
"c","group1", 5, 6
"b","group3", 7, 11
"a","group1",1, 2
"b","group1", 3, 4
"c","group2", 5, 6
"b","group3", 7, 11
2 changes: 1 addition & 1 deletion src/main/java/water/parser/CustomParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ public void checkColumnNames(){
}
}
if(!conflictingNames.isEmpty())
throw new ParseSetupGuessException("Invalid header. Got conflicting column names. " + conflictingNames.toString(),null,null);
throw new ParseSetupGuessException("Column labels must be unique but these labels are repeated:" + conflictingNames.toString(),null,null);
}
}
public ParserSetup clone(){
Expand Down

0 comments on commit c4a5aec

Please sign in to comment.