Skip to content

Commit

Permalink
feat: imputation is compatible with HDF5Matrix objects
Browse files Browse the repository at this point in the history
  • Loading branch information
cvanderaa committed Oct 6, 2022
1 parent 6109107 commit 45f5524
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 9 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## MsCoreUtils 1.9.2

- feat: imputation is compatible with HDF5Matrix objects
- feat: normalization is compatible with HDF5Matrix objects
- feat: matrix aggregation is compatible with HDF5Matrix objects
- fix+feat: aggregate_by_matrix now correctly handles missing data and
Expand Down
29 changes: 24 additions & 5 deletions R/imputation.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
##' - *MLE*: Maximum likelihood-based imputation method using the EM
##' algorithm. Implemented in the `norm::imp.norm()`. function. See
##' [norm::imp.norm()] for details and additional parameters. Note
##' that here, `...` are passed to the [norm::em.norm()` function,
##' that here, `...` are passed to the [norm::em.norm()] function,
##' rather to the actual imputation function `imp.norm`.
##'
##' - *bpca*: Bayesian missing value imputation are available, as
Expand Down Expand Up @@ -183,24 +183,39 @@
##' mar = "knn",
##' mnar = "min")
##'
##' @param x A matrix with missing values to be imputed.
##' @param x A matrix or an `HDF5Matrix` object to be imputed.
##'
##' @param method `character(1)` defining the imputation method. See
##' `imputeMethods()` for available ones.
##'
##' @param ... Additional parameters passed to the inner imputation
##' function.
##'
##' @return A matrix of same class as `x` with dimensions `dim(x)`.
##'
##' @export
impute_matrix <- function(x,
method,
FUN,
...) {
## stopifnot(is(m, "matrix"))
if (!anyNA(x)) return(x)
## Handle HDF5Matrix
xIsHDF5 <- FALSE
if (inherits(x, "HDF5Array")) {
xIsHDF5 <- TRUE
p <- HDF5Array::path(x) ## stored for later writing to disk
## Watch out this can lead to memory burst when x is large.
x <- as.matrix(x)
}
## User-provided imputation function
if (!missing(FUN) && is.function(FUN))
return(impute_fun(x, FUN, ...))
if (!missing(FUN) && is.function(FUN)) {
res <- impute_fun(x, FUN, ...)
## Write to HDF5 file if the input is on HDF5 backend
if (xIsHDF5)
res <- HDF5Array::writeHDF5Array(res, filepath = p,
with.dimnames = TRUE)
return(res)
}
## Function name provided as a character
if (missing(method))
stop("Please specify an imputation method. ",
Expand Down Expand Up @@ -237,6 +252,10 @@ impute_matrix <- function(x,
res <- impute_RF(x, ...)
}
## else method == "none" -- do nothing
## Write to HDF5 file if the input is on HDF5 backend
if (xIsHDF5)
res <- HDF5Array::writeHDF5Array(res, filepath = p,
with.dimnames = TRUE)
res
}

Expand Down
8 changes: 6 additions & 2 deletions man/imputation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

96 changes: 94 additions & 2 deletions tests/testthat/test_imputation.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ test_that("all imputation methods", {
m <- m[m != "with"] ## see below
m <- m[m != "nbavg"] ## see next test
for (.m in m) {
xx <- impute_matrix(x, method = .m)
if (.m == "knn") {
expect_warning(xx <- impute_matrix(x, method = .m),
regexp = "more than.*entries missing")
} else {
xx <- impute_matrix(x, method = .m)
}
expect_false(any(is.na(xx)))
}
expect_error(impute_matrix(x, method = "mixed",
Expand All @@ -30,17 +35,70 @@ test_that("all imputation methods", {
mnar = "min",
mar = "knn")
expect_false(any(is.na(mx)))
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x, tmpf, with.dimnames = TRUE)
for (.m in m) {
set.seed(1234)
if (.m == "knn") {
expect_warning(xxhdf5 <- impute_matrix(xhdf5, method = .m),
regexp = "more than.*entries missing")
} else {
xxhdf5 <- impute_matrix(xhdf5, method = .m)
}
expect_true(inherits(xxhdf5, "HDF5Matrix"))
if (.m != "MLE") { ## MLE is non-deterministic (apparently)
set.seed(1234)
if (.m == "knn") {
expect_warning(xx <- impute_matrix(x, method = .m),
regexp = "more than.*entries missing")
} else {
xx <- impute_matrix(x, method = .m)
}
expect_equal(as.matrix(xxhdf5), xx)
}
}
## Remove file
unlink(tmpf)
}
})

test_that("none method", {
xx <- impute_matrix(x, method = "none")
expect_identical(x, xx)
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x, tmpf, with.dimnames = TRUE)
xxhdf5 <- impute_matrix(xhdf5, method = "none")
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), xx)
## Remove file
unlink(tmpf)
}
})

test_that("zero and with method", {
x1 <- impute_matrix(x, method = "with", val = 0)
x2 <- impute_matrix(x, method = "zero")
expect_identical(x1, x2)
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x, tmpf, with.dimnames = TRUE)
xxhdf5 <- impute_matrix(xhdf5, method = "with", val = 0)
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), x1)
xxhdf5 <- impute_matrix(xhdf5, method = "zero")
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), x2)
## Remove file
unlink(tmpf)
}
})

test_that("nbavg methods", {
Expand Down Expand Up @@ -76,6 +134,17 @@ test_that("nbavg methods", {
expect_true(all(is.na(xx[3, 3:4])))
expect_true(xx[5, 2] == 10)
expect_true(xx[4, 3] == 14)
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x2, tmpf, with.dimnames = TRUE)
xxhdf5 <- impute_matrix(xhdf5, method = "nbavg", k = 0)
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), xx)
## Remove file
unlink(tmpf)
}
})


Expand All @@ -93,9 +162,21 @@ test_that("impute: mandatory method", {
})

test_that("impute: absence of missing values", {
x_imp <- impute_matrix(x, method = "knn")
expect_warning(x_imp <- impute_matrix(x, method = "knn"),
regexp = "more than.*entries missing")
x_imp_2 <- impute_matrix(x_imp, method = "knn")
expect_identical(x_imp, x_imp_2)
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x_imp, tmpf, with.dimnames = TRUE)
xxhdf5 <- impute_matrix(xhdf5, method = "knn")
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), x_imp)
## Remove file
unlink(tmpf)
}
})

test_that("impute: user-provided function", {
Expand All @@ -116,4 +197,15 @@ test_that("impute: user-provided function", {
x2[1, 1] <- 1000
expect_equal(x2, x_imp)
expect_equal(x2, x_imp2)
## Test HDF5Matrix compatibility
if (requireNamespace("HDF5Array", quietly = TRUE)) {
## Data in HDF5
tmpf <- paste0(tempdir(), "/mhdf5")
xhdf5 <- HDF5Array::writeHDF5Array(x3, tmpf, with.dimnames = TRUE)
xxhdf5 <- impute_matrix(xhdf5, FUN = user_fun, val = 1000)
expect_true(inherits(xxhdf5, "HDF5Matrix"))
expect_equal(as.matrix(xxhdf5), x_imp)
## Remove file
unlink(tmpf)
}
})

0 comments on commit 45f5524

Please sign in to comment.