Skip to content

Commit

Permalink
h2o.impute commit
Browse files Browse the repository at this point in the history
  • Loading branch information
spennihana committed Oct 2, 2014
1 parent bead377 commit 87127a8
Show file tree
Hide file tree
Showing 14 changed files with 1,658 additions and 550 deletions.
110 changes: 107 additions & 3 deletions R/h2o-package/R/Classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,16 @@ h2o.table <- function(x, return.in.R = FALSE) {
return(tb)
}

h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none') {
ddply <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
.inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
if (inherits(.data, "H2OParsedData")) UseMethod("ddply")
else plyr::ddply(.data, .variables, .fun, ..., .progress, .inform, .drop, .parallel, .paraopts) }

ddply.H2OParsedData <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
.inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {

# .inform, .drop, .parallel, .paropts are all ignored inputs.

if(missing(.data)) stop('must specify .data')
if(class(.data) != "H2OParsedData") stop('.data must be an H2OParsedData object')
if( missing(.variables) ) stop('must specify .variables')
Expand Down Expand Up @@ -560,7 +569,6 @@ h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none')
res <- .h2o.__exec2(.data@h2o, exec_cmd)
.h2o.exec2(res$dest_key, h2o = .data@h2o, res$dest_key)
}
ddply <- h2o.ddply

# TODO: how to avoid masking plyr?
`h2o..` <- function(...) {
Expand All @@ -571,6 +579,81 @@ ddply <- h2o.ddply

`.` <- `h2o..`

#'
#' Impute Missing Values
#'
#' Impute the missing values in the data `column` belonging to the dataset `data`.
#'
#' Possible values for `method`: "mean", "median", "reg", "RF"
#'
#' If `groupBy` is NULL, then for `mean` and `median`, missing values are imputed using the column mean/median.
#' For `reg` and `RF`, all columns except for `column` are used in the regression/RF fit.
#'
#' If `groupBy` is not NULL, then for `mean` and `median`, the missing values are imputed using the mean/median of
#' `column` within the groups formed by the groupBy columns.
#' For `reg` and `RF`, the groupBy variables are the input variables to the regression/RF fit.
#'
#' If the column is non-numeric and the method selected is "reg", an error will be produced.
h2o.impute <- function(data, column, method = "mean", groupBy = NULL) {
# possible methods: "mean", "median", "reg", "RF"
# what happens when a grouping has only NA values ? -> default to "method" for the unimputed column.
stopifnot(!missing(data))
stopifnot(!missing(column))
stopifnot(method %in% c("mean", "median", "mode"))
# if (!is.null(groupBy)) stopifnot(any(groupBy <= 0))
stopifnot(inherits(data, "H2OParsedData"))

.data <- data
.variables <- groupBy
idx <- NULL
if (!is.null(.variables)) {
# we accept eg .(col1, col2), c('col1', 'col2'), 1:2, c(1,2)
# as column names. This is a bit complicated
if( class(.variables) == 'character'){
vars <- .variables
idx <- match(vars, colnames(.data))
} else if( class(.variables) == 'H2Oquoted' ){
vars <- as.character(.variables)
idx <- match(vars, colnames(.data))
} else if( class(.variables) == 'quoted' ){ # plyr overwrote our . fn
vars <- names(.variables)
idx <- match(vars, colnames(.data))
} else if( class(.variables) == 'integer' ){
vars <- .variables
idx <- .variables
} else if( class(.variables) == 'numeric' ){ # this will happen eg c(1,2,3)
vars <- .variables
idx <- as.integer(.variables)
}
bad <- is.na(idx) | idx < 1 | idx > ncol(.data)
if( any(bad) ) stop( sprintf('can\'t recognize .variables %s', paste(vars[bad], sep=',')) )
idx <- idx - 1
}

col_idx <- NULL
if( class(column) == 'character'){
vars <- column
col_idx <- match(vars, colnames(.data))
} else if( class(column) == 'H2Oquoted' ){
vars <- as.character(column)
col_idx <- match(vars, colnames(.data))
} else if( class(column) == 'quoted' ){ # plyr overwrote our . fn
vars <- names(column)
col_idx <- match(vars, colnames(.data))
} else if( class(column) == 'integer' ){
vars <- column
col_idx <- column
} else if( class(column) == 'numeric' ){ # this will happen eg c(1,2,3)
vars <- column
col_idx <- as.integer(column)
}
bad <- is.na(col_idx) | col_idx < 1 | col_idx > ncol(.data)
if( any(bad) ) stop( sprintf('can\'t recognize column %s', paste(vars[bad], sep=',')) )
if (length(col_idx) > 1) stop("Only allows imputation of a single column at a time!")
#x@h2o, .h2o.__HACK_SETCOLNAMES2, source=x@key, cols=numCols, comma_separated_list=name)
invisible(.h2o.__remoteSend(data@h2o, .h2o.__PAGE_IMPUTE, source=data@key, column=col_idx-1, method=method, group_by=idx))
}

h2o.addFunction <- function(object, fun, name){
if( missing(object) || class(object) != 'H2OClient' ) stop('must specify h2o connection in object')
if( missing(fun) ) stop('must specify fun')
Expand Down Expand Up @@ -960,8 +1043,12 @@ setMethod("floor", "H2OParsedData", function(x) { .h2o.__unop2("floor", x) })
setMethod("trunc", "H2OParsedData", function(x) { .h2o.__unop2("trunc", x) })
setMethod("log", "H2OParsedData", function(x) { .h2o.__unop2("log", x) })
setMethod("exp", "H2OParsedData", function(x) { .h2o.__unop2("exp", x) })
setMethod("is.na", "H2OParsedData", function(x) { .h2o.__unop2("is.na", x) })
setMethod("is.na", "H2OParsedData", function(x) {
res <- .h2o.__unop2("is.na", x)
# res <- as.numeric(res)
})
setMethod("t", "H2OParsedData", function(x) { .h2o.__unop2("t", x) })
setMethod("as.numeric", "H2OParsedData", function(x) { .h2o.__unop2("as.numeric", x) })

round.H2OParsedData <- function(x, digits = 0) {
if(length(digits) > 1 || !is.numeric(digits)) stop("digits must be a single number")
Expand Down Expand Up @@ -1367,6 +1454,23 @@ function (test, yes, no)
ans
}

#.getDomainMapping2 <- function(l, s = "") {
# if (is.list(l)) {
# return( .getDomainMapping2( l[[length(l)]], s))
# }
# return(.getDomainMapping(eval(l), s)$map)
#}
#
#ifelse <- function(test,yes, no) if (inherits(test, "H2OParsedData") ||
# inherits(no, "H2OParsedData") ||
# inherits(yes, "H2oParsedData")) UseMethod("ifelse") else base::ifelse(test, yes, no)
#
#ifelse.H2OParsedData <- function(test, yes, no) {
# if (is.character(yes)) yes <- .getDomainMapping2(as.list(substitute(test)), yes)
# if (is.character(no)) no <- .getDomainMapping2(as.list(substitute(test)), no)
# h2o.exec(ifelse(test, yes, no))
#}

#setMethod("ifelse", signature(test="H2OParsedData", yes="ANY", no="ANY"), function(test, yes, no) {
# if(!(is.numeric(yes) || class(yes) == "H2OParsedData") || !(is.numeric(no) || class(no) == "H2OParsedData"))
# stop("Unimplemented")
Expand Down
2 changes: 2 additions & 0 deletions R/h2o-package/R/Internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,10 @@ h2o.setLogPath <- function(path, type) {
.h2o.__PAGE_VIEWALL = "StoreView.json"
.h2o.__DOWNLOAD_LOGS = "LogDownload.json"
.h2o.__DOMAIN_MAPPING = "2/DomainMapping.json"
.h2o.__SET_DOMAIN = "2/SetDomains.json"
.h2o.__PAGE_ALLMODELS = "2/Models.json"

.h2o.__PAGE_IMPUTE= "2/Impute.json"
.h2o.__PAGE_EXEC2 = "2/Exec2.json"
.h2o.__PAGE_IMPORTFILES2 = "2/ImportFiles2.json"
.h2o.__PAGE_EXPORTFILES = "2/ExportFiles.json"
Expand Down
63 changes: 63 additions & 0 deletions R/tests/testdir_demos/runit_impute_demo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')

# a useful function to make a quick copy of a data frame in H2O
cp <- function(this) this[1:nrow(this), 1:ncol(this)]

# a useful function to count number of NAs in a column
numNAs <- function(col) sum(is.na(col))

conn <- h2o.init()

prostate.hex <- h2o.uploadFile(conn, "../../../smalldata/logreg/prostate_missing.csv", "prostate.hex")
dim(prostate.hex)

print("Summary of the data in iris_missing.csv")
print("Each column has 50 missing observations (at random)")
summary(prostate.hex)


print("Make a copy of the original dataset to play with.")
hex <- cp(prostate.hex)
print(hex@key)
print(prostate.hex@key)
print(prostate.hex)
print(hex)


print("Impute a numeric column with the mean")
nas <- numNAs(hex[,"DPROS"])
print(paste("NAs before imputation:", nas))
h2o.impute(hex, .(DPROS), method = "mean")

nas <- numNAs(hex[,"DPROS"])
print(paste("NAs after imputation: ", nas))



# OTHER POSSIBLE SYNTAXES ALLOWED:
hex <- cp(prostate.hex)
h2o.impute(hex, 8, method = "mean")

hex <- cp(prostate.hex)
h2o.impute(hex, c("VOL"), method = "mean")

hex <- cp(prostate.hex)
h2o.impute(hex, "VOL", method = "mean")

# USING MEDIAN
print("Impute a numeric column with the median")

hex <- cp(prostate.hex)
h2o.impute(hex, .(VOL), method = "median")

hex <- cp(prostate.hex)
h2o.impute(hex, 8, method = "median")

hex <- cp(prostate.hex)
h2o.impute(hex, c("VOL"), method = "median")

hex <- cp(prostate.hex)
h2o.impute(hex, "VOL", method = "median")

testEnd()
84 changes: 84 additions & 0 deletions R/tests/testdir_munging/impute/runit_impute_basic.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../../findNSourceUtils.R')

cp <- function(this) this[1:nrow(this), 1:ncol(this)]

test.eq2.h2o.assign<-
function(conn) {
iris.hex <- h2o.uploadFile(conn, locate("smalldata/iris/iris_missing.csv"), "iris.hex")
dim(iris.hex)

Log.info("Summary of the data in iris_missing.csv")
Log.info("Each column has 50 missing observations (at random)")
summary(iris.hex)


Log.info("Make a copy of the original dataset to play with.")
hex <- cp(iris.hex)
print(hex@key)
print(iris.hex@key)
print(iris.hex)
print(hex)

Log.info("Impute a numeric column with the mean")
h2o.impute(hex, .(Sepal.Length), method = "mean")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, 1, method = "mean")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, c("Sepal.Length"), method = "mean")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, "Sepal.Length", method = "mean")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)

Log.info("Impute a numeric column with the median")
h2o.impute(hex, .(Sepal.Length), method = "median")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, 1, method = "median")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, c("Sepal.Length"), method = "median")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, "Sepal.Length", method = "median")
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
hex <- cp(iris.hex)

Log.info("Impute a factor column (uses the mode)")
h2o.impute(hex, .(Species), method = "mode")
expect_that(sum(is.na(hex[,"Species"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, 5, method = "mode")
expect_that(sum(is.na(hex[,"Species"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, c("Species"), method = "mode")
expect_that(sum(is.na(hex[,"Species"])), equals(0))
hex <- cp(iris.hex)
h2o.impute(hex, "Species", method = "mode")
expect_that(sum(is.na(hex[,"Species"])), equals(0))
hex <- cp(iris.hex)

Log.info("Now check that imputing with column groupings works...")
h2o.impute(hex, .(Sepal.Length), method = "mean", groupBy = c("Sepal.Width", "Petal.Width"))
# possibly some NAs still present in the column, because of NAs in the groupBy columns
print(hex)
hex <- cp(iris.hex)
h2o.impute(hex, 1, method = "median", groupBy = c("Species", "Petal.Width", "Petal.Length"))
print(hex)
hex <- cp(iris.hex)
h2o.impute(hex, "Petal.Width", method = "mean", groupBy = c(1,2,5))
print(hex)
hex <- cp(iris.hex)
h2o.impute(hex, "Species", method = "mode", groupBy = c(1,3,4))
print(hex)


testEnd()
}

doTest("Test h2o.assign(data,key)", test.eq2.h2o.assign)

Loading

0 comments on commit 87127a8

Please sign in to comment.