Skip to content

Commit 87127a8

Browse files
committed
h2o.impute commit
1 parent bead377 commit 87127a8

File tree

14 files changed

+1658
-550
lines changed

14 files changed

+1658
-550
lines changed

R/h2o-package/R/Classes.R

+107-3
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,16 @@ h2o.table <- function(x, return.in.R = FALSE) {
519519
return(tb)
520520
}
521521

522-
h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none') {
522+
ddply <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
523+
.inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
524+
if (inherits(.data, "H2OParsedData")) UseMethod("ddply")
525+
else plyr::ddply(.data, .variables, .fun, ..., .progress, .inform, .drop, .parallel, .paraopts) }
526+
527+
ddply.H2OParsedData <- function (.data, .variables, .fun = NULL, ..., .progress = "none",
528+
.inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
529+
530+
# .inform, .drop, .parallel, .paropts are all ignored inputs.
531+
523532
if(missing(.data)) stop('must specify .data')
524533
if(class(.data) != "H2OParsedData") stop('.data must be an H2OParsedData object')
525534
if( missing(.variables) ) stop('must specify .variables')
@@ -560,7 +569,6 @@ h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none')
560569
res <- .h2o.__exec2(.data@h2o, exec_cmd)
561570
.h2o.exec2(res$dest_key, h2o = .data@h2o, res$dest_key)
562571
}
563-
ddply <- h2o.ddply
564572

565573
# TODO: how to avoid masking plyr?
566574
`h2o..` <- function(...) {
@@ -571,6 +579,81 @@ ddply <- h2o.ddply
571579

572580
`.` <- `h2o..`
573581

582+
#'
583+
#' Impute Missing Values
584+
#'
585+
#' Impute the missing values in the data `column` belonging to the dataset `data`.
586+
#'
587+
#' Possible values for `method`: "mean", "median", "reg", "RF"
588+
#'
589+
#' If `groupBy` is NULL, then for `mean` and `median`, missing values are imputed using the column mean/median.
590+
#' For `reg` and `RF`, all columns except for `column` are used in the regression/RF fit.
591+
#'
592+
#' If `groupBy` is not NULL, then for `mean` and `median`, the missing values are imputed using the mean/median of
593+
#' `column` within the groups formed by the groupBy columns.
594+
#' For `reg` and `RF`, the groupBy variables are the input variables to the regression/RF fit.
595+
#'
596+
#' If the column is non-numeric and the method selected is "reg", an error will be produced.
597+
h2o.impute <- function(data, column, method = "mean", groupBy = NULL) {
598+
# possible methods: "mean", "median", "reg", "RF"
599+
# what happens when a grouping has only NA values ? -> default to "method" for the unimputed column.
600+
stopifnot(!missing(data))
601+
stopifnot(!missing(column))
602+
stopifnot(method %in% c("mean", "median", "mode"))
603+
# if (!is.null(groupBy)) stopifnot(any(groupBy <= 0))
604+
stopifnot(inherits(data, "H2OParsedData"))
605+
606+
.data <- data
607+
.variables <- groupBy
608+
idx <- NULL
609+
if (!is.null(.variables)) {
610+
# we accept eg .(col1, col2), c('col1', 'col2'), 1:2, c(1,2)
611+
# as column names. This is a bit complicated
612+
if( class(.variables) == 'character'){
613+
vars <- .variables
614+
idx <- match(vars, colnames(.data))
615+
} else if( class(.variables) == 'H2Oquoted' ){
616+
vars <- as.character(.variables)
617+
idx <- match(vars, colnames(.data))
618+
} else if( class(.variables) == 'quoted' ){ # plyr overwrote our . fn
619+
vars <- names(.variables)
620+
idx <- match(vars, colnames(.data))
621+
} else if( class(.variables) == 'integer' ){
622+
vars <- .variables
623+
idx <- .variables
624+
} else if( class(.variables) == 'numeric' ){ # this will happen eg c(1,2,3)
625+
vars <- .variables
626+
idx <- as.integer(.variables)
627+
}
628+
bad <- is.na(idx) | idx < 1 | idx > ncol(.data)
629+
if( any(bad) ) stop( sprintf('can\'t recognize .variables %s', paste(vars[bad], sep=',')) )
630+
idx <- idx - 1
631+
}
632+
633+
col_idx <- NULL
634+
if( class(column) == 'character'){
635+
vars <- column
636+
col_idx <- match(vars, colnames(.data))
637+
} else if( class(column) == 'H2Oquoted' ){
638+
vars <- as.character(column)
639+
col_idx <- match(vars, colnames(.data))
640+
} else if( class(column) == 'quoted' ){ # plyr overwrote our . fn
641+
vars <- names(column)
642+
col_idx <- match(vars, colnames(.data))
643+
} else if( class(column) == 'integer' ){
644+
vars <- column
645+
col_idx <- column
646+
} else if( class(column) == 'numeric' ){ # this will happen eg c(1,2,3)
647+
vars <- column
648+
col_idx <- as.integer(column)
649+
}
650+
bad <- is.na(col_idx) | col_idx < 1 | col_idx > ncol(.data)
651+
if( any(bad) ) stop( sprintf('can\'t recognize column %s', paste(vars[bad], sep=',')) )
652+
if (length(col_idx) > 1) stop("Only allows imputation of a single column at a time!")
653+
#x@h2o, .h2o.__HACK_SETCOLNAMES2, source=x@key, cols=numCols, comma_separated_list=name)
654+
invisible(.h2o.__remoteSend(data@h2o, .h2o.__PAGE_IMPUTE, source=data@key, column=col_idx-1, method=method, group_by=idx))
655+
}
656+
574657
h2o.addFunction <- function(object, fun, name){
575658
if( missing(object) || class(object) != 'H2OClient' ) stop('must specify h2o connection in object')
576659
if( missing(fun) ) stop('must specify fun')
@@ -960,8 +1043,12 @@ setMethod("floor", "H2OParsedData", function(x) { .h2o.__unop2("floor", x) })
9601043
setMethod("trunc", "H2OParsedData", function(x) { .h2o.__unop2("trunc", x) })
9611044
setMethod("log", "H2OParsedData", function(x) { .h2o.__unop2("log", x) })
9621045
setMethod("exp", "H2OParsedData", function(x) { .h2o.__unop2("exp", x) })
963-
setMethod("is.na", "H2OParsedData", function(x) { .h2o.__unop2("is.na", x) })
1046+
setMethod("is.na", "H2OParsedData", function(x) {
1047+
res <- .h2o.__unop2("is.na", x)
1048+
# res <- as.numeric(res)
1049+
})
9641050
setMethod("t", "H2OParsedData", function(x) { .h2o.__unop2("t", x) })
1051+
setMethod("as.numeric", "H2OParsedData", function(x) { .h2o.__unop2("as.numeric", x) })
9651052

9661053
round.H2OParsedData <- function(x, digits = 0) {
9671054
if(length(digits) > 1 || !is.numeric(digits)) stop("digits must be a single number")
@@ -1367,6 +1454,23 @@ function (test, yes, no)
13671454
ans
13681455
}
13691456

1457+
#.getDomainMapping2 <- function(l, s = "") {
1458+
# if (is.list(l)) {
1459+
# return( .getDomainMapping2( l[[length(l)]], s))
1460+
# }
1461+
# return(.getDomainMapping(eval(l), s)$map)
1462+
#}
1463+
#
1464+
#ifelse <- function(test,yes, no) if (inherits(test, "H2OParsedData") ||
1465+
# inherits(no, "H2OParsedData") ||
1466+
# inherits(yes, "H2oParsedData")) UseMethod("ifelse") else base::ifelse(test, yes, no)
1467+
#
1468+
#ifelse.H2OParsedData <- function(test, yes, no) {
1469+
# if (is.character(yes)) yes <- .getDomainMapping2(as.list(substitute(test)), yes)
1470+
# if (is.character(no)) no <- .getDomainMapping2(as.list(substitute(test)), no)
1471+
# h2o.exec(ifelse(test, yes, no))
1472+
#}
1473+
13701474
#setMethod("ifelse", signature(test="H2OParsedData", yes="ANY", no="ANY"), function(test, yes, no) {
13711475
# if(!(is.numeric(yes) || class(yes) == "H2OParsedData") || !(is.numeric(no) || class(no) == "H2OParsedData"))
13721476
# stop("Unimplemented")

R/h2o-package/R/Internal.R

+2
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,10 @@ h2o.setLogPath <- function(path, type) {
104104
.h2o.__PAGE_VIEWALL = "StoreView.json"
105105
.h2o.__DOWNLOAD_LOGS = "LogDownload.json"
106106
.h2o.__DOMAIN_MAPPING = "2/DomainMapping.json"
107+
.h2o.__SET_DOMAIN = "2/SetDomains.json"
107108
.h2o.__PAGE_ALLMODELS = "2/Models.json"
108109

110+
.h2o.__PAGE_IMPUTE= "2/Impute.json"
109111
.h2o.__PAGE_EXEC2 = "2/Exec2.json"
110112
.h2o.__PAGE_IMPORTFILES2 = "2/ImportFiles2.json"
111113
.h2o.__PAGE_EXPORTFILES = "2/ExportFiles.json"
+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
2+
source('../findNSourceUtils.R')
3+
4+
# a useful function to make a quick copy of a data frame in H2O
5+
cp <- function(this) this[1:nrow(this), 1:ncol(this)]
6+
7+
# a useful function to count number of NAs in a column
8+
numNAs <- function(col) sum(is.na(col))
9+
10+
conn <- h2o.init()
11+
12+
prostate.hex <- h2o.uploadFile(conn, "../../../smalldata/logreg/prostate_missing.csv", "prostate.hex")
13+
dim(prostate.hex)
14+
15+
print("Summary of the data in iris_missing.csv")
16+
print("Each column has 50 missing observations (at random)")
17+
summary(prostate.hex)
18+
19+
20+
print("Make a copy of the original dataset to play with.")
21+
hex <- cp(prostate.hex)
22+
print(hex@key)
23+
print(prostate.hex@key)
24+
print(prostate.hex)
25+
print(hex)
26+
27+
28+
print("Impute a numeric column with the mean")
29+
nas <- numNAs(hex[,"DPROS"])
30+
print(paste("NAs before imputation:", nas))
31+
h2o.impute(hex, .(DPROS), method = "mean")
32+
33+
nas <- numNAs(hex[,"DPROS"])
34+
print(paste("NAs after imputation: ", nas))
35+
36+
37+
38+
# OTHER POSSIBLE SYNTAXES ALLOWED:
39+
hex <- cp(prostate.hex)
40+
h2o.impute(hex, 8, method = "mean")
41+
42+
hex <- cp(prostate.hex)
43+
h2o.impute(hex, c("VOL"), method = "mean")
44+
45+
hex <- cp(prostate.hex)
46+
h2o.impute(hex, "VOL", method = "mean")
47+
48+
# USING MEDIAN
49+
print("Impute a numeric column with the median")
50+
51+
hex <- cp(prostate.hex)
52+
h2o.impute(hex, .(VOL), method = "median")
53+
54+
hex <- cp(prostate.hex)
55+
h2o.impute(hex, 8, method = "median")
56+
57+
hex <- cp(prostate.hex)
58+
h2o.impute(hex, c("VOL"), method = "median")
59+
60+
hex <- cp(prostate.hex)
61+
h2o.impute(hex, "VOL", method = "median")
62+
63+
testEnd()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
2+
source('../../findNSourceUtils.R')
3+
4+
cp <- function(this) this[1:nrow(this), 1:ncol(this)]
5+
6+
test.eq2.h2o.assign<-
7+
function(conn) {
8+
iris.hex <- h2o.uploadFile(conn, locate("smalldata/iris/iris_missing.csv"), "iris.hex")
9+
dim(iris.hex)
10+
11+
Log.info("Summary of the data in iris_missing.csv")
12+
Log.info("Each column has 50 missing observations (at random)")
13+
summary(iris.hex)
14+
15+
16+
Log.info("Make a copy of the original dataset to play with.")
17+
hex <- cp(iris.hex)
18+
print(hex@key)
19+
print(iris.hex@key)
20+
print(iris.hex)
21+
print(hex)
22+
23+
Log.info("Impute a numeric column with the mean")
24+
h2o.impute(hex, .(Sepal.Length), method = "mean")
25+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
26+
hex <- cp(iris.hex)
27+
h2o.impute(hex, 1, method = "mean")
28+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
29+
hex <- cp(iris.hex)
30+
h2o.impute(hex, c("Sepal.Length"), method = "mean")
31+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
32+
hex <- cp(iris.hex)
33+
h2o.impute(hex, "Sepal.Length", method = "mean")
34+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
35+
hex <- cp(iris.hex)
36+
37+
Log.info("Impute a numeric column with the median")
38+
h2o.impute(hex, .(Sepal.Length), method = "median")
39+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
40+
hex <- cp(iris.hex)
41+
h2o.impute(hex, 1, method = "median")
42+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
43+
hex <- cp(iris.hex)
44+
h2o.impute(hex, c("Sepal.Length"), method = "median")
45+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
46+
hex <- cp(iris.hex)
47+
h2o.impute(hex, "Sepal.Length", method = "median")
48+
expect_that(sum(is.na(hex[,"Sepal.Length"])), equals(0))
49+
hex <- cp(iris.hex)
50+
51+
Log.info("Impute a factor column (uses the mode)")
52+
h2o.impute(hex, .(Species), method = "mode")
53+
expect_that(sum(is.na(hex[,"Species"])), equals(0))
54+
hex <- cp(iris.hex)
55+
h2o.impute(hex, 5, method = "mode")
56+
expect_that(sum(is.na(hex[,"Species"])), equals(0))
57+
hex <- cp(iris.hex)
58+
h2o.impute(hex, c("Species"), method = "mode")
59+
expect_that(sum(is.na(hex[,"Species"])), equals(0))
60+
hex <- cp(iris.hex)
61+
h2o.impute(hex, "Species", method = "mode")
62+
expect_that(sum(is.na(hex[,"Species"])), equals(0))
63+
hex <- cp(iris.hex)
64+
65+
Log.info("Now check that imputing with column groupings works...")
66+
h2o.impute(hex, .(Sepal.Length), method = "mean", groupBy = c("Sepal.Width", "Petal.Width"))
67+
# possibly some NAs still present in the column, because of NAs in the groupBy columns
68+
print(hex)
69+
hex <- cp(iris.hex)
70+
h2o.impute(hex, 1, method = "median", groupBy = c("Species", "Petal.Width", "Petal.Length"))
71+
print(hex)
72+
hex <- cp(iris.hex)
73+
h2o.impute(hex, "Petal.Width", method = "mean", groupBy = c(1,2,5))
74+
print(hex)
75+
hex <- cp(iris.hex)
76+
h2o.impute(hex, "Species", method = "mode", groupBy = c(1,3,4))
77+
print(hex)
78+
79+
80+
testEnd()
81+
}
82+
83+
doTest("Test h2o.assign(data,key)", test.eq2.h2o.assign)
84+

0 commit comments

Comments
 (0)