h2o.impute commit

BandoSamurai · Oct 2, 2014 · 87127a8 · 87127a8
1 parent bead377
commit 87127a8
Show file tree

Hide file tree

Showing 14 changed files with 1,658 additions and 550 deletions.
diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R
@@ -519,7 +519,16 @@ h2o.table <- function(x, return.in.R = FALSE) {
   return(tb)
 }
 
-h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none') {
+ddply <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
+             .inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
+             if (inherits(.data, "H2OParsedData")) UseMethod("ddply")
+             else plyr::ddply(.data, .variables, .fun, ..., .progress, .inform, .drop, .parallel, .paraopts) }
+
+ddply.H2OParsedData <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
+                                 .inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
+
+  # .inform, .drop, .parallel, .paropts are all ignored inputs.
+
   if(missing(.data)) stop('must specify .data')
   if(class(.data) != "H2OParsedData") stop('.data must be an H2OParsedData object')
   if( missing(.variables) ) stop('must specify .variables')
@@ -560,7 +569,6 @@ h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none')
   res <- .h2o.__exec2(.data@h2o, exec_cmd)
   .h2o.exec2(res$dest_key, h2o = .data@h2o, res$dest_key)
 }
-ddply <- h2o.ddply
 
 # TODO: how to avoid masking plyr?
 `h2o..` <- function(...) {
@@ -571,6 +579,81 @@ ddply <- h2o.ddply
 
 `.` <- `h2o..`
 
+#'
+#' Impute Missing Values
+#'
+#' Impute the missing values in the data `column` belonging to the dataset `data`.
+#'
+#' Possible values for `method`:  "mean", "median", "reg", "RF"
+#'
+#' If `groupBy` is NULL, then for `mean` and `median`, missing values are imputed using the column mean/median.
+#' For `reg` and `RF`, all columns except for `column` are used in the regression/RF fit.
+#'
+#' If `groupBy` is not NULL, then for `mean` and `median`, the missing values are imputed using the mean/median of
+#' `column` within the groups formed by the groupBy columns.
+#' For `reg` and `RF`, the groupBy variables are the input variables to the regression/RF fit.
+#'
+#' If the column is non-numeric and the method selected is "reg", an error will be produced.
+h2o.impute <- function(data, column, method = "mean", groupBy = NULL) {
+  # possible methods: "mean", "median", "reg", "RF"
+  # what happens when a grouping has only NA values ? -> default to "method" for the unimputed column.
+  stopifnot(!missing(data))
+  stopifnot(!missing(column))
+  stopifnot(method %in% c("mean", "median", "mode"))
+#  if (!is.null(groupBy)) stopifnot(any(groupBy <= 0))
+  stopifnot(inherits(data, "H2OParsedData"))
+
+  .data <- data
+  .variables <- groupBy
+  idx <- NULL
+  if (!is.null(.variables)) {
+  # we accept eg .(col1, col2), c('col1', 'col2'), 1:2, c(1,2)
+    # as column names.  This is a bit complicated
+    if( class(.variables) == 'character'){
+      vars <- .variables
+      idx <- match(vars, colnames(.data))
+    } else if( class(.variables) == 'H2Oquoted' ){
+      vars <- as.character(.variables)
+      idx <- match(vars, colnames(.data))
+    } else if( class(.variables) == 'quoted' ){ # plyr overwrote our . fn
+      vars <- names(.variables)
+      idx <- match(vars, colnames(.data))
+    } else if( class(.variables) == 'integer' ){
+      vars <- .variables
+      idx <- .variables
+    } else if( class(.variables) == 'numeric' ){   # this will happen eg c(1,2,3)
+      vars <- .variables
+      idx <- as.integer(.variables)
+    }
+    bad <- is.na(idx) | idx < 1 | idx > ncol(.data)
+    if( any(bad) ) stop( sprintf('can\'t recognize .variables %s', paste(vars[bad], sep=',')) )
+    idx <- idx - 1
+  }
+
+  col_idx <- NULL
+  if( class(column) == 'character'){
+    vars <- column
+    col_idx <- match(vars, colnames(.data))
+  } else if( class(column) == 'H2Oquoted' ){
+    vars <- as.character(column)
+    col_idx <- match(vars, colnames(.data))
+  } else if( class(column) == 'quoted' ){ # plyr overwrote our . fn
+    vars <- names(column)
+    col_idx <- match(vars, colnames(.data))
+  } else if( class(column) == 'integer' ){
+    vars <- column
+    col_idx <- column
+  } else if( class(column) == 'numeric' ){   # this will happen eg c(1,2,3)
+    vars <- column
+    col_idx <- as.integer(column)
+  }
+  bad <- is.na(col_idx) | col_idx < 1 | col_idx > ncol(.data)
+  if( any(bad) ) stop( sprintf('can\'t recognize column %s', paste(vars[bad], sep=',')) )
+  if (length(col_idx) > 1) stop("Only allows imputation of a single column at a time!")
+  #x@h2o, .h2o.__HACK_SETCOLNAMES2, source=x@key, cols=numCols, comma_separated_list=name)
+  invisible(.h2o.__remoteSend(data@h2o, .h2o.__PAGE_IMPUTE, source=data@key, column=col_idx-1, method=method, group_by=idx))
+}
+
 h2o.addFunction <- function(object, fun, name){
   if( missing(object) || class(object) != 'H2OClient' ) stop('must specify h2o connection in object')
   if( missing(fun) ) stop('must specify fun')
@@ -960,8 +1043,12 @@ setMethod("floor",   "H2OParsedData", function(x) { .h2o.__unop2("floor", x) })
 setMethod("trunc",   "H2OParsedData", function(x) { .h2o.__unop2("trunc", x) })
 setMethod("log",     "H2OParsedData", function(x) { .h2o.__unop2("log",   x) })
 setMethod("exp",     "H2OParsedData", function(x) { .h2o.__unop2("exp",   x) })
-setMethod("is.na",   "H2OParsedData", function(x) { .h2o.__unop2("is.na", x) })
+setMethod("is.na",   "H2OParsedData", function(x) {
+  res <- .h2o.__unop2("is.na", x)
+#  res <- as.numeric(res)
+})
 setMethod("t",       "H2OParsedData", function(x) { .h2o.__unop2("t",     x) })
+setMethod("as.numeric", "H2OParsedData", function(x) { .h2o.__unop2("as.numeric", x) })
 
 round.H2OParsedData <- function(x, digits = 0) {
   if(length(digits) > 1 || !is.numeric(digits)) stop("digits must be a single number")
@@ -1367,6 +1454,23 @@ function (test, yes, no)
     ans
 }
 
+#.getDomainMapping2 <- function(l, s = "") {
+# if (is.list(l)) {
+#   return( .getDomainMapping2( l[[length(l)]], s))
+# }
+# return(.getDomainMapping(eval(l), s)$map)
+#}
+#
+#ifelse <- function(test,yes, no) if (inherits(test, "H2OParsedData") ||
+#                                     inherits(no, "H2OParsedData")    ||
+#                                     inherits(yes, "H2oParsedData")) UseMethod("ifelse") else base::ifelse(test, yes, no)
+#
+#ifelse.H2OParsedData <- function(test, yes, no) {
+#  if (is.character(yes)) yes <- .getDomainMapping2(as.list(substitute(test)), yes)
+#  if (is.character(no))  no  <- .getDomainMapping2(as.list(substitute(test)), no)
+#  h2o.exec(ifelse(test, yes, no))
+#}
+
 #setMethod("ifelse", signature(test="H2OParsedData", yes="ANY", no="ANY"), function(test, yes, no) {
 #  if(!(is.numeric(yes) || class(yes) == "H2OParsedData") || !(is.numeric(no) || class(no) == "H2OParsedData"))
 #    stop("Unimplemented")

diff --git a/R/h2o-package/R/Internal.R b/R/h2o-package/R/Internal.R
@@ -104,8 +104,10 @@ h2o.setLogPath <- function(path, type) {
 .h2o.__PAGE_VIEWALL = "StoreView.json"
 .h2o.__DOWNLOAD_LOGS = "LogDownload.json"
 .h2o.__DOMAIN_MAPPING = "2/DomainMapping.json"
+.h2o.__SET_DOMAIN = "2/SetDomains.json"
 .h2o.__PAGE_ALLMODELS = "2/Models.json"
 
+.h2o.__PAGE_IMPUTE= "2/Impute.json"
 .h2o.__PAGE_EXEC2 = "2/Exec2.json"
 .h2o.__PAGE_IMPORTFILES2 = "2/ImportFiles2.json"
 .h2o.__PAGE_EXPORTFILES = "2/ExportFiles.json"

diff --git a/R/tests/testdir_demos/runit_impute_demo.R b/R/tests/testdir_demos/runit_impute_demo.R
@@ -0,0 +1,63 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source('../findNSourceUtils.R')
+
+# a useful function to make a quick copy of a data frame in H2O
+cp <- function(this) this[1:nrow(this), 1:ncol(this)]
+
+# a useful function to count number of NAs in a column
+numNAs <- function(col) sum(is.na(col))
+
+conn <- h2o.init()
+
+prostate.hex <- h2o.uploadFile(conn, "../../../smalldata/logreg/prostate_missing.csv", "prostate.hex")
+dim(prostate.hex)
+
+print("Summary of the data in iris_missing.csv")
+print("Each column has 50 missing observations (at random)")
+summary(prostate.hex)
+
+
+print("Make a copy of the original dataset to play with.")
+hex <- cp(prostate.hex)
+print(hex@key)
+print(prostate.hex@key)
+print(prostate.hex)
+print(hex)
+
+
+print("Impute a numeric column with the mean")
+nas <- numNAs(hex[,"DPROS"])
+print(paste("NAs before imputation:", nas))
+h2o.impute(hex, .(DPROS), method = "mean")
+
+nas <- numNAs(hex[,"DPROS"])
+print(paste("NAs after imputation: ", nas))
+
+
+
+# OTHER POSSIBLE SYNTAXES ALLOWED:
+hex <- cp(prostate.hex)
+h2o.impute(hex, 8, method = "mean")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, c("VOL"), method = "mean")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, "VOL", method = "mean")
+
+# USING  MEDIAN
+print("Impute a numeric column with the median")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, .(VOL), method = "median")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, 8, method = "median")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, c("VOL"), method = "median")
+
+hex <- cp(prostate.hex)
+h2o.impute(hex, "VOL", method = "median")
+
+testEnd()
diff --git a/R/tests/testdir_munging/impute/runit_impute_basic.R b/R/tests/testdir_munging/impute/runit_impute_basic.R
@@ -0,0 +1,84 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source('../../findNSourceUtils.R')
+
+cp <- function(this) this[1:nrow(this), 1:ncol(this)]
+
+test.eq2.h2o.assign<-
+function(conn) {
+    iris.hex <- h2o.uploadFile(conn, locate("smalldata/iris/iris_missing.csv"), "iris.hex")
+    dim(iris.hex)
+
+    Log.info("Summary of the data in iris_missing.csv")
+    Log.info("Each column has 50 missing observations (at random)")
+    summary(iris.hex)
+
+
+    Log.info("Make a copy of the original dataset to play with.")
+    hex <- cp(iris.hex)
+    print(hex@key)
+    print(iris.hex@key)
+    print(iris.hex)
+    print(hex)
+
+    Log.info("Impute a numeric column with the mean")
+    h2o.impute(hex, .(Sepal.Length), method = "mean")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, 1, method = "mean")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, c("Sepal.Length"), method = "mean")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, "Sepal.Length", method = "mean")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+
+    Log.info("Impute a numeric column with the median")
+    h2o.impute(hex, .(Sepal.Length), method = "median")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, 1, method = "median")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, c("Sepal.Length"), method = "median")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, "Sepal.Length", method = "median")
+    expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
+    hex <- cp(iris.hex)
+
+    Log.info("Impute a factor column (uses the mode)")
+    h2o.impute(hex, .(Species), method = "mode")
+    expect_that(sum(is.na(hex[,"Species"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, 5, method = "mode")
+    expect_that(sum(is.na(hex[,"Species"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, c("Species"), method = "mode")
+    expect_that(sum(is.na(hex[,"Species"])), equals(0))
+    hex <- cp(iris.hex)
+    h2o.impute(hex, "Species", method = "mode")
+    expect_that(sum(is.na(hex[,"Species"])), equals(0))
+    hex <- cp(iris.hex)
+
+    Log.info("Now check that imputing with column groupings works...")
+    h2o.impute(hex, .(Sepal.Length), method = "mean", groupBy = c("Sepal.Width", "Petal.Width"))
+    # possibly some NAs still present in the column, because of NAs in the groupBy columns
+    print(hex)
+    hex <- cp(iris.hex)
+    h2o.impute(hex, 1, method = "median", groupBy = c("Species", "Petal.Width", "Petal.Length"))
+    print(hex)
+    hex <- cp(iris.hex)
+    h2o.impute(hex, "Petal.Width", method = "mean", groupBy = c(1,2,5))
+    print(hex)
+    hex <- cp(iris.hex)
+    h2o.impute(hex, "Species", method = "mode", groupBy = c(1,3,4))
+    print(hex)
+
+
+    testEnd()
+}
+
+doTest("Test h2o.assign(data,key)", test.eq2.h2o.assign)
+