@@ -519,7 +519,16 @@ h2o.table <- function(x, return.in.R = FALSE) {
519
519
return (tb )
520
520
}
521
521
522
- h2o.ddply <- function (.data , .variables , .fun = NULL , ... , .progress = ' none' ) {
522
# Generic front-end for ddply that masks plyr::ddply.
#
# When `.data` is an H2OParsedData object the call is dispatched (S3) to
# ddply.H2OParsedData; any other input is forwarded to plyr::ddply.
#
# Arguments mirror plyr::ddply:
#   .data       data to process (H2OParsedData or anything plyr accepts)
#   .variables  grouping variables
#   .fun        function applied to each group
#   ...         extra arguments handed to `.fun`
#   .progress, .inform, .drop, .parallel, .paropts
#               plyr options, forwarded untouched on the plyr path
#
# Returns whatever the selected implementation returns.
ddply <- function(.data, .variables, .fun = NULL, ..., .progress = "none",
                  .inform = FALSE, .drop = TRUE, .parallel = FALSE, .paropts = NULL) {
  if (inherits(.data, "H2OParsedData")) {
    UseMethod("ddply")
  } else {
    # Everything after `...` in plyr::ddply must be passed by name; positional
    # values would be swallowed by `...` and handed to `.fun` instead.
    plyr::ddply(.data, .variables, .fun, ...,
                .progress = .progress, .inform = .inform, .drop = .drop,
                .parallel = .parallel, .paropts = .paropts)
  }
}
526
+
527
+ ddply.H2OParsedData <- function (.data , .variables , .fun = NULL , ... , .progress = " none" ,
528
+ .inform = FALSE , .drop = TRUE , .parallel = FALSE , .paropts = NULL ) {
529
+
530
+ # .inform, .drop, .parallel, .paropts are all ignored inputs.
531
+
523
532
if (missing(.data )) stop(' must specify .data' )
524
533
if (class(.data ) != " H2OParsedData" ) stop(' .data must be an H2OParsedData object' )
525
534
if ( missing(.variables ) ) stop(' must specify .variables' )
@@ -560,7 +569,6 @@ h2o.ddply <- function (.data, .variables, .fun = NULL, ..., .progress = 'none')
560
569
res <- .h2o.__exec2(.data @ h2o , exec_cmd )
561
570
.h2o.exec2(res $ dest_key , h2o = .data @ h2o , res $ dest_key )
562
571
}
563
- ddply <- h2o.ddply
564
572
565
573
# TODO: how to avoid masking plyr?
566
574
`h2o..` <- function (... ) {
@@ -571,6 +579,81 @@ ddply <- h2o.ddply
571
579
572
580
# Bind `.` to `h2o..` so that expressions such as .(col1, col2) resolve to the
# H2O implementation (this is the binding that collides with plyr's own `.`).
`.` <- `h2o..`
573
581
582
#'
#' Impute Missing Values
#'
#' Impute the missing values in the data `column` belonging to the dataset `data`.
#' This function validates its arguments, converts the column references to
#' 0-based indices and sends an impute request to H2O.
#'
#' Accepted values for `method`: "mean", "median", "mode".
#' NOTE(review): the previous documentation listed "reg" and "RF" (regression /
#' random-forest imputation, with an error for "reg" on non-numeric columns),
#' but the argument check in this function rejects them. Confirm against the
#' backend before advertising or enabling them.
#'
#' If `groupBy` is NULL, missing values are imputed using the statistic of the
#' whole column.
#'
#' If `groupBy` is not NULL, missing values are imputed using the statistic of
#' `column` within the groups formed by the groupBy columns.
#' NOTE(review): the original author's note says a group containing only NA
#' values falls back to the ungrouped `method`; that happens server-side and is
#' not visible here.
#'
#' @param data An `H2OParsedData` object.
#' @param column The single column to impute, given as a column name, a
#'   1-based column index, or a quoted `.(name)` expression.
#' @param method One of "mean", "median" or "mode".
#' @param groupBy Optional grouping columns, given as names, 1-based indices
#'   or a quoted `.(col1, col2)` expression. `NULL` means no grouping.
#' @return Invisibly, the value returned by the remote call.
h2o.impute <- function(data, column, method = "mean", groupBy = NULL) {
  stopifnot(!missing(data))
  stopifnot(!missing(column))
  stopifnot(is.character(method), length(method) == 1,
            method %in% c("mean", "median", "mode"))
  stopifnot(inherits(data, "H2OParsedData"))

  # Translate a column reference into 1-based column positions of `data`.
  # Accepted forms: .(col1, col2), c('col1', 'col2'), 1:2, c(1, 2).
  # `what` only labels the error message.
  resolve_cols <- function(ref, what) {
    if (inherits(ref, "H2Oquoted")) {
      labels <- as.character(ref)
      pos <- match(labels, colnames(data))
    } else if (inherits(ref, "quoted")) {  # plyr overwrote our . fn
      labels <- names(ref)
      pos <- match(labels, colnames(data))
    } else if (is.character(ref)) {
      labels <- ref
      pos <- match(labels, colnames(data))
    } else if (is.numeric(ref)) {  # integer (1:2) as well as double (c(1, 2))
      labels <- ref
      pos <- as.integer(ref)
    } else {
      # Previously an unsupported type fell through silently and an empty
      # index was sent to the server.
      stop(sprintf("can't recognize %s of class %s", what,
                   paste(class(ref), collapse = "/")))
    }
    bad <- is.na(pos) | pos < 1 | pos > ncol(data)
    if (any(bad)) {
      # collapse (not sep) so that several bad entries yield one message
      stop(sprintf("can't recognize %s %s", what,
                   paste(labels[bad], collapse = ",")))
    }
    pos
  }

  # The request uses 0-based column indices.
  group_idx <- NULL
  if (!is.null(groupBy)) {
    group_idx <- resolve_cols(groupBy, ".variables") - 1
  }

  col_idx <- resolve_cols(column, "column")
  if (length(col_idx) != 1) {
    stop("Only allows imputation of a single column at a time!")
  }

  invisible(.h2o.__remoteSend(data@h2o, .h2o.__PAGE_IMPUTE, source = data@key,
                              column = col_idx - 1, method = method,
                              group_by = group_idx))
}
656
+
574
657
h2o.addFunction <- function (object , fun , name ){
575
658
if ( missing(object ) || class(object ) != ' H2OClient' ) stop(' must specify h2o connection in object' )
576
659
if ( missing(fun ) ) stop(' must specify fun' )
@@ -960,8 +1043,12 @@ setMethod("floor", "H2OParsedData", function(x) { .h2o.__unop2("floor", x) })
960
1043
# Unary math on an H2O frame: every method hands the frame to .h2o.__unop2
# together with the name of the operator to apply.
setMethod(
  f = "trunc",
  signature = "H2OParsedData",
  definition = function(x) .h2o.__unop2("trunc", x)
)
setMethod(
  f = "log",
  signature = "H2OParsedData",
  definition = function(x) .h2o.__unop2("log", x)
)
setMethod(
  f = "exp",
  signature = "H2OParsedData",
  definition = function(x) .h2o.__unop2("exp", x)
)
963
- setMethod ("is.na ", "H2OParsedData", function(x) { .h2o.__unop2("is.na", x) })
1046
# is.na for an H2O frame: forwards the frame to .h2o.__unop2 with the "is.na"
# operator and returns the helper's result directly, like the sibling unary
# methods. The result is returned as-is; an as.numeric() conversion of the
# result was sketched by the author but is intentionally not applied.
setMethod("is.na", "H2OParsedData", function(x) {
  .h2o.__unop2("is.na", x)
})
964
1050
# `t` and `as.numeric` for an H2O frame: both hand the frame to .h2o.__unop2
# together with the operator name.
setMethod(
  f = "t",
  signature = "H2OParsedData",
  definition = function(x) .h2o.__unop2("t", x)
)
setMethod(
  f = "as.numeric",
  signature = "H2OParsedData",
  definition = function(x) .h2o.__unop2("as.numeric", x)
)
965
1052
966
1053
round.H2OParsedData <- function (x , digits = 0 ) {
967
1054
if (length(digits ) > 1 || ! is.numeric(digits )) stop(" digits must be a single number" )
@@ -1367,6 +1454,23 @@ function (test, yes, no)
1367
1454
ans
1368
1455
}
1369
1456
1457
+ # .getDomainMapping2 <- function(l, s = "") {
1458
+ # if (is.list(l)) {
1459
+ # return( .getDomainMapping2( l[[length(l)]], s))
1460
+ # }
1461
+ # return(.getDomainMapping(eval(l), s)$map)
1462
+ # }
1463
+ #
1464
+ # ifelse <- function(test,yes, no) if (inherits(test, "H2OParsedData") ||
1465
+ # inherits(no, "H2OParsedData") ||
1466
+ # inherits(yes, "H2OParsedData")) UseMethod("ifelse") else base::ifelse(test, yes, no)
1467
+ #
1468
+ # ifelse.H2OParsedData <- function(test, yes, no) {
1469
+ # if (is.character(yes)) yes <- .getDomainMapping2(as.list(substitute(test)), yes)
1470
+ # if (is.character(no)) no <- .getDomainMapping2(as.list(substitute(test)), no)
1471
+ # h2o.exec(ifelse(test, yes, no))
1472
+ # }
1473
+
1370
1474
# setMethod("ifelse", signature(test="H2OParsedData", yes="ANY", no="ANY"), function(test, yes, no) {
1371
1475
# if(!(is.numeric(yes) || class(yes) == "H2OParsedData") || !(is.numeric(no) || class(no) == "H2OParsedData"))
1372
1476
# stop("Unimplemented")
0 commit comments