Merge branch 'master' of github.com:0xdata/h2o

nuoncul · Feb 1, 2014 · c4a5aec · c4a5aec
2 parents 6bd4af7 + 133ada9
commit c4a5aec
Show file tree

Hide file tree

Showing 5 changed files with 167 additions and 8 deletions.
diff --git a/R/tests/testdir_jira/runit_NOPASS_pub_168_dfpredicates.R b/R/tests/testdir_jira/runit_NOPASS_pub_168_dfpredicates.R
@@ -0,0 +1,63 @@
+#
+# test filtering via factors
+#
+
+
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
+source('../findNSourceUtils.R')
+
+
+
+factorfilter <- function(conn){
+  # Subset an H2O frame with factor-equality predicates and check the results.
+  # conn: connection handle forwarded to h2o.importFile().
+  Log.info('uploading ddply testing dataset')
+  df.h <- h2o.importFile(conn, locate('smalldata/jira/pub-180.csv'))
+  Log.info('printing from h2o')
+  Log.info( head(df.h) )
+
+  Log.info('subsetting via factor')
+  df.h.1 <- df.h[ df.h$colgroup == 'a', ]
+  expect_that( dim(df.h.1) == c(3,4) )
+
+  df.h.2 <- df.h[ df.h[,2] == "group2", ]
+  expect_that( dim(df.h.2) == c(2, 4) )
+
+  df.h.3 <- df.h[ df.h[,2] == 'group1' & df.h$colgroup == 'c', ]
+  expect_that( dim(df.h.3) == c(1,4) )
+
+  Log.info('localizing')
+  df.1 <- as.data.frame(df.h.1)
+  df.2 <- as.data.frame(df.h.2)
+  df.3 <- as.data.frame(df.h.3)
+
+  Log.info('testing')
+  expect_that( dim(df.1) == c(3, 4) )
+  expect_that( unique( df.1[,1] ) == 'a' && unique(df.1[,2]) == 'group1')
+  expect_that(all( df.1[,3] == c(1,2,1) ))
+  expect_that(all( df.1[,4] == c(2,3,2) ))
+
+  expect_that( dim(df.2) == c(2, 4) )
+  expect_that( unique( df.2[,1] ) == 'c' && unique(df.2[,2]) == 'group2')
+  expect_that(all( df.2[,3] == c(5,5) ))
+  expect_that(all( df.2[,4] == c(6,6) ))
+
+  expect_that( dim(df.3) == c(1, 4) )
+  expect_that( df.3[1,1] == 'c' && df.3[1,2] == 'group1' )  # df.3 has a single row, so index row 1
+  expect_that( df.3[1,3] == 5 )
+  expect_that( df.3[1,4] == 6 )
+
+  testEnd()
+}
+
+if(FALSE){
+  # Plain-R equivalent of the H2O subsetting above; never executed, kept for reference.
+  data <- read.csv(locate('smalldata/jira/pub-180.csv'), header=TRUE)
+
+  data[ data$colgroup == 'a', ]
+  data[ data[,2] == 'group2', ]
+  data[ data[,2] == 'group1' & data$colgroup == 'c', ]
+}
+
+# Pass the test function and its label to doTest().
+doTest('factor filtering', factorfilter)
diff --git a/R/tests/testdir_jira/runit_NOPASS_pub_180_ddply.R b/R/tests/testdir_jira/runit_NOPASS_pub_180_ddply.R
@@ -0,0 +1,86 @@
+#
+# ddply
+#
+
+
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
+source('../findNSourceUtils.R')
+
+
+
+ddplytest <- function(conn){
+  # Run ddply() on an H2O frame with four grouping/aggregation variants and
+  # check the localized results against hard-coded expected values.
+  # conn: connection handle forwarded to h2o.importFile().
+  Log.info('uploading ddply testing dataset')
+  df.h <- h2o.importFile(conn, locate('smalldata/jira/pub-180.csv'))
+
+  Log.info('printing from h2o')
+  Log.info( head(df.h) )
+
+  Log.info('grouping over a single column (equivalent to tapply)')
+  df.h.1 <- ddply(df.h, .(colgroup), function(df){ min(df$col1)} )
+
+  Log.info('grouping over multiple columns (equivalent to tapply with IDX=group1 + group2)')
+  df.h.2 <- ddply(df.h, .(colgroup, colgroup2), function(df){ min(df$col1)} )
+
+  Log.info('single grouping column, use 2 columns')
+  df.h.3 <- ddply(df.h, .(colgroup), function(df){ min(df$col1 + df$col2) } )
+
+  Log.info('grouping multiple columns, use 2 columns')
+  # Keep this result in df.h.4: the checks below expect df.3 to hold the
+  # single-column grouping and df.4 to hold this two-column grouping.
+  df.h.4 <- ddply(df.h, .(colgroup, colgroup2), function(df){ min(df$col1 + df$col2) } )
+
+  Log.info('pulling data locally')
+  df.1 <- as.data.frame( df.h.1 )
+  df.2 <- as.data.frame( df.h.2 )
+  df.3 <- as.data.frame( df.h.3 )
+  df.4 <- as.data.frame( df.h.4 )
+
+  Log.info('testing')
+  expect_that( dim(df.1) == c(3, 2) )
+  expect_that(all( df.1[,1] == c('a', 'b', 'c') ))
+  expect_that(all( df.1[,2] == c(1,3,5) ))
+
+  expect_that( dim(df.2) == c(5, 3) )
+  expect_that(all( df.2[,1] == c('a', 'b', 'b', 'c', 'c') ))
+  expect_that(all( df.2[,2] == paste('group', c(1,1,3,1,2), sep='') ))
+  expect_that(all( df.2[,3] == c(1,3,7,5,5) ))
+
+  expect_that( dim(df.3) == c(3, 2) )
+  expect_that(all( df.3[,1] == c('a', 'b', 'c') ))
+  expect_that(all( df.3[,2] == c(3,7,11) ))
+
+  expect_that( dim(df.4) == c(5, 3) )
+  expect_that(all( df.4[,1] == c('a', 'b', 'b', 'c', 'c') ))
+  expect_that(all( df.4[,2] == paste('group', c(1,1,3,1,2), sep='') ))
+  expect_that(all( df.4[,3] == c(3,7,18,11,11) ))
+
+  testEnd()
+}
+
+if(FALSE){
+  # Plain-R/plyr equivalent of the H2O calls above; never executed, kept for reference.
+  library(plyr)
+  data <- read.csv(locate('smalldata/jira/pub-180.csv'), header=TRUE)
+
+  # example 1 in plain R
+  # semantically, these produce much the same thing, although one puts in a dataframe and the other in a named vector
+  # sql GROUP BY colgroup
+  tapply(data$col1, data$colgroup, min)
+  ddply(data, .(colgroup), function(df){min(df$col1)} )
+
+  # example 2 -- equivalent to sql GROUP BY colgroup, colgroup2;
+  tapply(data$col1, paste(data$colgroup,data$colgroup2), min)
+  ddply(data, .(colgroup, colgroup2), function(df){min(df$col1)} )
+
+  # example 3 - can't build with tapply
+  ddply(data, .(colgroup), function(df){ min(df$col1 + df$col2)} )
+
+  # example 4 - can't build with tapply
+  ddply(data, .(colgroup, colgroup2), function(df){ min(df$col1 + df$col2)} )
+}
+
+# Pass the test function and its label to doTest().
+doTest('ddply', ddplytest)
diff --git a/R/tests/testdir_jira/runit_NOPASS_v_3_apply.R b/R/tests/testdir_jira/runit_NOPASS_v_3_apply.R
@@ -2,12 +2,9 @@
 # apply
 #
 
-
 setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"-f")))
 source('../findNSourceUtils.R')
 
-
-
 applytest <- function(conn){
   Log.info('uploading apply testing dataset')
   df.h <- h2o.importFile(conn, locate('smalldata/jira/v-3.csv'))
@@ -18,14 +15,16 @@ applytest <- function(conn){
   Log.info('applying over 1, 2, 1:2')
   df.h.1 <- apply(df.h, 1, function(x){ sum(x) })
   df.h.2 <- apply(df.h, 2, function(x){ sum(x) })
-  df.h.3 <- apply(df.h, 1:2, function(x){ x + 1})
+# While the semantics of apply(,1:2,) are easy (same as map), the syntactic
+# form is annoying to deal with right now.  Dropped, as the alternative
+# forms to get the same job done are easy & supported: df <- df+1
+#  df.h.3 <- apply(df.h, 1:2, function(x){ x + 1})
 
   Log.info('pulling data locally')
   df.1 <- as.data.frame( df.h.1 )
   df.2 <- as.data.frame( df.h.2 )
   df.3 <- as.data.frame( df.h.3 )
 
-
   expect_that(all( df.1[,1] == c(3,7,11) ))
   expect_that(all( df.2[,1] == c(9, 12) ))
   expect_that(all( df.3[,1] == c(2,4,6) ))
@@ -34,6 +33,4 @@ applytest <- function(conn){
   testEnd()
 }
 
-
-
 doTest('apply', applytest)
diff --git a/smalldata/jira/pub-180.csv b/smalldata/jira/pub-180.csv
@@ -0,0 +1,13 @@
+"colgroup", "colgroup2", "col1", "col2"
+"a","group1",1, 2
+"b","group1", 3, 4
+"c","group2", 5, 6
+"b","group3", 7, 11
+"a","group1",2, 3
+"b","group1", 3, 4
+"c","group1", 5, 6
+"b","group3", 7, 11
+"a","group1",1, 2
+"b","group1", 3, 4
+"c","group2", 5, 6
+"b","group3", 7, 11
diff --git a/src/main/java/water/parser/CustomParser.java b/src/main/java/water/parser/CustomParser.java
@@ -114,7 +114,7 @@ public void checkColumnNames(){
           }
         }
         if(!conflictingNames.isEmpty())
-          throw new ParseSetupGuessException("Invalid header. Got conflicting column names. " + conflictingNames.toString(),null,null);
+          throw new ParseSetupGuessException("Column labels must be unique but these labels are repeated:" + conflictingNames.toString(),null,null);
       }
     }
     public ParserSetup clone(){